/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */


#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>

#if !defined(lint)
#include "assym.h"
#endif  /* lint */


/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/kcopy routine.
 *
 *      ! WARNING : <Register usage convention>
 *      ! In kcopy(), %o5 holds the previous error handler and a flag,
 *      ! LOFAULT_SET (low bits). %o5 is null in bcopy().
 *      ! %o5 is not available for any other use.
 *
 * On entry:
 *      ! Determine whether to use the FP register version or the
 *      ! leaf routine version depending on the size of the copy.
 *      ! Set up error handling accordingly.
 *      ! The transition point depends on FP_COPY.
 *      ! For both versions %o5 is reserved.
 *
 * kcopy():
 *      if(length > FP_COPY)
 *              go to regular_kcopy
 *
 *      ! Setup_leaf_rtn_error_handler
 *      %o5 = curthread->t_lofault;          ! save existing handler in %o5
 *      %o5 |= LOFAULT_SET;                  ! OR in the LOFAULT_SET flag
 *      curthread->t_lofault = .sm_copyerr;
 *      goto small_bcopy();
 *
 * regular_kcopy:
 *      save_registers()
 *      %o5 = curthread->t_lofault;          ! save existing handler in %o5
 *      %o5 |= LOFAULT_SET;                  ! OR in the LOFAULT_SET flag
 *      curthread->t_lofault = .copyerr;
 *      goto do_copy();
 *
 * bcopy():
 *      if(length > FP_COPY)
 *              go to regular_bcopy
 *
 *      ! Setup_leaf_rtn_error_handler
 *      %o5 = curthread->t_lofault;          ! save existing handler in %o5
 *      curthread->t_lofault = .sm_copyerr;
 *      goto small_bcopy();
 *
 * regular_bcopy:
 *      %o5 = curthread->t_lofault;          ! save existing handler in %o5
 *      curthread->t_lofault = .copyerr;
 *      goto do_copy();
 *
 * small_bcopy:
 *      ! handle copies smaller than FP_COPY
 *      restore t_lofault handler
 *      exit
 *
 * do_copy:
 *      ! handle copies larger than FP_COPY
 *      save fp_regs
 *      blockcopy;
 *      restore fp_regs
 *      restore t_lofault handler if we came from kcopy();
 *
 *
 * In leaf lofault handler:
 *      curthread->t_lofault = (%o5 & ~LOFAULT_SET);     ! restore old t_lofault
 *      return (errno)
 *
 * In lofault handler:
 *      curthread->t_lofault = (%o5 & ~LOFAULT_SET);     ! restore old t_lofault
 *      restore fp_regs
 *      return (errno)
 *
 *
 *
 * For all of bcopy/copyin/copyout the copy logic is specialized according
 * to how the src and dst are aligned and how much data needs to be moved.
 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
 *
 * N2/RF Flow :
 *
 * if (count < FP_COPY) {  (584 bytes)
 *   set small fault handler (no register window save/restore)
 *   if count < SHORTCOPY  (7 bytes)
 *      copy bytes; go to short_exit
 *   else
 *     determine dst alignment, move minimum bytes/halfwords to
 *     get dst aligned on long word boundary
 *     if( src is on long word boundary ) {
 * medlong:                                        src/dst aligned on 8 bytes
 *       copy with ldx/stx in 4-way unrolled loop;
 *       copy final 0-31 bytes; go to short_exit
 *     } else {                                 src/dst not aligned on 8 bytes
 *       if src is word aligned, ld/st words in 32-byte chunks
 *       if src is half word aligned, ld half, ld word, ld half; pack
 *              into long word, store long words in 32-byte chunks
 *       if src is byte aligned, ld byte,half,word parts; pack into long
 *              word, store long words in 32-byte chunks
 *       move final 0-31 bytes according to src alignment; go to short_exit
 * short_exit:
 *       restore trap handler if needed, retl
 *     }
 * } else {                                        More than FP_COPY bytes
 *     set fault handler
 *     disable kernel preemption
 *     save registers, save FP registers if in use
 *     move bytes to align destination register on long word boundary
 *     if(src is on long word boundary) {          src/dst aligned on 8 bytes
 *       align dst on 64 byte boundary;  use 8-way test for each of 8 possible
 *       src alignments relative to a 64 byte boundary to select the
 *       16-way unrolled loop (128 bytes) to use for
 *       block load, fmovd, block-init-store, block-store, fmovd operations
 *       then go to remain_stuff.
 * remain_stuff: move remaining bytes. go to long_exit
 *     } else {
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments to nearest long word relative to 64 byte boundary to
 *       select the 8-way unrolled loop (64 bytes) to use for
 *       block load, falign, fmovd, block-store loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 *       goto unalign_done.
 * unalign_done:
 *       move remaining bytes for unaligned cases. go to long_exit
 * long_exit:
 *       restore %gsr, FP regs (either from stack or set to zero),
 *       restore trap handler, check for kernel preemption request,
 *       handle if needed, ret.
 * }
 *
 * Other platforms include hw_bcopy_limit_[1248] to control the exact
 * point where the FP register code is used. On those platforms, the
 * FP register code did not leave data in L2 cache, potentially affecting
 * performance more than the gain/loss from the algorithm difference.
 * For N2/RF, block store places data in the L2 cache, so use or non-use
 * of the FP registers has no effect on L2 cache behavior.
 * The cost of testing hw_bcopy_limit_* according to different
 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
 * were not used. That cost was judged too high relative to the benefits,
 * so the hw_bcopy_limit option is omitted from this code.
 */
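
/*
 * A hedged C model of the size-based dispatch above (illustration only;
 * leaf_copy, fp_block_copy and the FP save/restore helpers are
 * hypothetical stand-ins for the assembly labels in this file):
 *
 *      #include <stddef.h>
 *
 *      extern void leaf_copy(const void *, void *, size_t);
 *      extern void fp_block_copy(const void *, void *, size_t);
 *      extern void save_fp_state(void), restore_fp_state(void);
 *
 *      void
 *      bcopy_model(const void *from, void *to, size_t count)
 *      {
 *              if (count > FP_COPY) {
 *                      // FP path: save FP state, block copy, restore
 *                      save_fp_state();
 *                      fp_block_copy(from, to, count);
 *                      restore_fp_state();
 *              } else {
 *                      // leaf path: no register window, no FP regs
 *                      leaf_copy(from, to, count);
 *              }
 *      }
 */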

/*
 * For this number of bytes or fewer, we always copy byte-for-byte.
 */
#define SMALL_LIMIT     7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that a t_lofault
 * handler was set.
 */
#define LOFAULT_SET 2

/*
 * This macro aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2.
 * data3 is preserved for the next merge.
 */
#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)    \
        sllx    data1, lshift, data1                            ;\
        srlx    data2, rshift, tmp                              ;\
        or      data1, tmp, data1                               ;\
        sllx    data2, lshift, data2                            ;\
        srlx    data3, rshift, tmp                              ;\
        or      data2, tmp, data2
/*
 * This macro aligns the data. Basically it merges
 * data1 and data2 to form a double word.
 */
#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)        \
        sllx    data1, lshift, data1                            ;\
        srlx    data2, rshift, tmp                              ;\
        or      data1, tmp, data1
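
/*
 * A minimal C sketch (illustration only, not kernel code) of what
 * ALIGN_DATA computes.  The function name and signature are hypothetical;
 * lshift is 8 * (src & 7) and rshift is 64 - lshift.
 *
 *      #include <stdint.h>
 *
 *      // Merge three doublewords read from an unaligned source into two
 *      // aligned doublewords; the caller keeps d3 to seed the next merge.
 *      static inline void
 *      align_data(uint64_t *d1, uint64_t *d2, uint64_t d3,
 *          unsigned lshift, unsigned rshift)
 *      {
 *              *d1 = (*d1 << lshift) | (*d2 >> rshift);
 *              *d2 = (*d2 << lshift) | (d3 >> rshift);
 *      }
 */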

#if !defined(NIAGARA_IMPL)
/*
 * Flags set in the lower bits of the t_lofault address:
 * FPUSED_FLAG: The FP registers were in use and must be restored
 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
 * COPY_FLAGS: Both of the above
 *
 * Other flags:
 * KPREEMPT_FLAG: kpreempt needs to be called
 */
#define FPUSED_FLAG     1
#define LOFAULT_SET     2
#define COPY_FLAGS      (FPUSED_FLAG | LOFAULT_SET)
#define KPREEMPT_FLAG   4
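
/*
 * Illustrative sketch only: how the flags ride in the low bits of the
 * saved t_lofault value.  Handler addresses are at least 4-byte aligned,
 * so the two low bits are free to carry flags.  These helper names are
 * hypothetical, not part of this file.
 *
 *      #include <stdint.h>
 *
 *      static inline uintptr_t
 *      tag_lofault(uintptr_t handler, uintptr_t flags)
 *      {
 *              return (handler | flags);       // flags in low bits
 *      }
 *
 *      static inline uintptr_t
 *      untag_lofault(uintptr_t tagged)
 *      {
 *              return (tagged & ~(uintptr_t)COPY_FLAGS);
 *      }
 */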

#define ALIGN_OFF_1_7                   \
        faligndata %d0, %d2, %d48       ;\
        faligndata %d2, %d4, %d50       ;\
        faligndata %d4, %d6, %d52       ;\
        faligndata %d6, %d8, %d54       ;\
        faligndata %d8, %d10, %d56      ;\
        faligndata %d10, %d12, %d58     ;\
        faligndata %d12, %d14, %d60     ;\
        faligndata %d14, %d16, %d62

#define ALIGN_OFF_8_15                  \
        faligndata %d2, %d4, %d48       ;\
        faligndata %d4, %d6, %d50       ;\
        faligndata %d6, %d8, %d52       ;\
        faligndata %d8, %d10, %d54      ;\
        faligndata %d10, %d12, %d56     ;\
        faligndata %d12, %d14, %d58     ;\
        faligndata %d14, %d16, %d60     ;\
        faligndata %d16, %d18, %d62

#define ALIGN_OFF_16_23                 \
        faligndata %d4, %d6, %d48       ;\
        faligndata %d6, %d8, %d50       ;\
        faligndata %d8, %d10, %d52      ;\
        faligndata %d10, %d12, %d54     ;\
        faligndata %d12, %d14, %d56     ;\
        faligndata %d14, %d16, %d58     ;\
        faligndata %d16, %d18, %d60     ;\
        faligndata %d18, %d20, %d62

#define ALIGN_OFF_24_31                 \
        faligndata %d6, %d8, %d48       ;\
        faligndata %d8, %d10, %d50      ;\
        faligndata %d10, %d12, %d52     ;\
        faligndata %d12, %d14, %d54     ;\
        faligndata %d14, %d16, %d56     ;\
        faligndata %d16, %d18, %d58     ;\
        faligndata %d18, %d20, %d60     ;\
        faligndata %d20, %d22, %d62

#define ALIGN_OFF_32_39                 \
        faligndata %d8, %d10, %d48      ;\
        faligndata %d10, %d12, %d50     ;\
        faligndata %d12, %d14, %d52     ;\
        faligndata %d14, %d16, %d54     ;\
        faligndata %d16, %d18, %d56     ;\
        faligndata %d18, %d20, %d58     ;\
        faligndata %d20, %d22, %d60     ;\
        faligndata %d22, %d24, %d62

#define ALIGN_OFF_40_47                 \
        faligndata %d10, %d12, %d48     ;\
        faligndata %d12, %d14, %d50     ;\
        faligndata %d14, %d16, %d52     ;\
        faligndata %d16, %d18, %d54     ;\
        faligndata %d18, %d20, %d56     ;\
        faligndata %d20, %d22, %d58     ;\
        faligndata %d22, %d24, %d60     ;\
        faligndata %d24, %d26, %d62

#define ALIGN_OFF_48_55                 \
        faligndata %d12, %d14, %d48     ;\
        faligndata %d14, %d16, %d50     ;\
        faligndata %d16, %d18, %d52     ;\
        faligndata %d18, %d20, %d54     ;\
        faligndata %d20, %d22, %d56     ;\
        faligndata %d22, %d24, %d58     ;\
        faligndata %d24, %d26, %d60     ;\
        faligndata %d26, %d28, %d62

#define ALIGN_OFF_56_63                 \
        faligndata %d14, %d16, %d48     ;\
        faligndata %d16, %d18, %d50     ;\
        faligndata %d18, %d20, %d52     ;\
        faligndata %d20, %d22, %d54     ;\
        faligndata %d22, %d24, %d56     ;\
        faligndata %d24, %d26, %d58     ;\
        faligndata %d26, %d28, %d60     ;\
        faligndata %d28, %d30, %d62
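
/*
 * A hedged C model (illustration only) of a single faligndata step used
 * by the ALIGN_OFF_* macros above: alignaddr records a byte offset in
 * %gsr.align, and each faligndata then extracts 8 bytes spanning two
 * adjacent source doublewords.  The macros unroll this eight times to
 * produce one 64-byte block per loop iteration.  The function name is
 * hypothetical; hi/lo are big-endian doubleword values.
 *
 *      #include <stdint.h>
 *
 *      static inline uint64_t
 *      falign(uint64_t hi, uint64_t lo, unsigned off)  // off = src & 7
 *      {
 *              if (off == 0)
 *                      return (hi);    // avoid a shift by 64
 *              return ((hi << (8 * off)) | (lo >> (8 * (8 - off))));
 *      }
 */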

/*
 * FP_COPY indicates the minimum number of bytes needed
 * to justify using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing FP_COPY below 256.
 */
#define FP_COPY                 584
#define SHORTCOPY               7
#define ASI_STBI_P              ASI_BLK_INIT_ST_QUAD_LDD_P
#define ASI_STBI_AIUS           ASI_BLK_INIT_QUAD_LDD_AIUS
#define CACHE_LINE              64
#define VIS_BLOCKSIZE           64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and two 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to assure a
 * block-aligned three block buffer in which to save we must reserve
 * four blocks on stack.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 3 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 4)
#define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 3) + 1)
#define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
#define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
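
/*
 * Illustration only (hypothetical helper, not part of this file): the
 * address arithmetic that BST_FP_TOSTACK/BLD_FP_FROMSTACK below perform
 * with add/and.  Rounding down to a VIS_BLOCKSIZE boundary inside the
 * four reserved blocks always leaves room for the three saved quadrants.
 *
 *      #include <stdint.h>
 *
 *      static inline uintptr_t
 *      fpreg_save_area(uintptr_t fp, uintptr_t stack_bias)
 *      {
 *              uintptr_t p = fp + stack_bias - SAVED_FPREGS_ADJUST;
 *              return (p & ~(uintptr_t)(VIS_BLOCKSIZE - 1));   // block align
 *      }
 */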

/*
 * In FP copies, if we do not have preserved data to restore over
 * the fp regs we used, then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 */
#define FZERO                           \
        fzero   %f0                     ;\
        fzero   %f2                     ;\
        faddd   %f0, %f2, %f4           ;\
        fmuld   %f0, %f2, %f6           ;\
        faddd   %f0, %f2, %f8           ;\
        fmuld   %f0, %f2, %f10          ;\
        faddd   %f0, %f2, %f12          ;\
        fmuld   %f0, %f2, %f14          ;\
        faddd   %f0, %f2, %f16          ;\
        fmuld   %f0, %f2, %f18          ;\
        faddd   %f0, %f2, %f20          ;\
        fmuld   %f0, %f2, %f22          ;\
        faddd   %f0, %f2, %f24          ;\
        fmuld   %f0, %f2, %f26          ;\
        faddd   %f0, %f2, %f28          ;\
        fmuld   %f0, %f2, %f30          ;\
        faddd   %f0, %f2, %f48          ;\
        fmuld   %f0, %f2, %f50          ;\
        faddd   %f0, %f2, %f52          ;\
        fmuld   %f0, %f2, %f54          ;\
        faddd   %f0, %f2, %f56          ;\
        fmuld   %f0, %f2, %f58          ;\
        faddd   %f0, %f2, %f60          ;\
        fmuld   %f0, %f2, %f62

#if !defined(lint)

/*
 * Macros to save and restore fp registers to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP.
 */
#define BST_FP_TOSTACK(tmp1)                                    \
        /* membar #Sync */                                      ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        stda    %f0, [tmp1]ASI_BLK_P                            ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        stda    %f16, [tmp1]ASI_BLK_P                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        stda    %f48, [tmp1]ASI_BLK_P                           ;\
        membar  #Sync

#define BLD_FP_FROMSTACK(tmp1)                                  \
        /* membar #Sync - provided at copy completion */        ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        ldda    [tmp1]ASI_BLK_P, %f0                            ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f16                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f48                           ;\
        membar  #Sync
#endif  /* lint */

#endif  /* NIAGARA_IMPL */
/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns an errno value on pagefault error, 0 if all is ok.
 */
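
/*
 * Typical use, as a sketch (caller-side code, not part of this file;
 * assumes the kcopy() prototype from <sys/systm.h>, and copy_one() is a
 * hypothetical caller):
 *
 *      #include <sys/systm.h>
 *
 *      static int
 *      copy_one(const void *src, void *dst, size_t len)
 *      {
 *              int err;
 *
 *              // kcopy returns 0 on success or the pagefault errno
 *              if ((err = kcopy(src, dst, len)) != 0)
 *                      return (err);
 *              return (0);
 *      }
 */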

#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }

#else   /* lint */

        .seg    ".text"
        .align  4

        ENTRY(kcopy)
#if !defined(NIAGARA_IMPL)
        cmp     %o2, FP_COPY                    ! check for small copy/leaf case
        bgt,pt  %ncc, .kcopy_more               !
        nop
.kcopy_small:                                   ! setup error handler
        sethi   %hi(.sm_copyerr), %o4
        or      %o4, %lo(.sm_copyerr), %o4      ! .sm_copyerr is lofault value
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        ! Note that we carefully do *not* flag the setting of
        ! t_lofault.
        membar  #Sync                           ! sync error barrier
        b       .sm_do_copy                     ! common code
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! set t_lofault


.kcopy_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
        or      %l7, %lo(.copyerr), %l7
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        ! Note that we carefully do *not* flag the setting of
        ! t_lofault.
        membar  #Sync                           ! sync error barrier
        b       .do_copy                        ! common code
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
/*
 * We got here because of a fault during a small kcopy, or during a
 * small bcopy for which a fault handler existed when bcopy was called.
 * No floating point registers are used by the small copies.
 * Small copies are from a leaf routine.
 * Errno value is in %g1.
 */
.sm_copyerr:
        ! The kcopy will always set a t_lofault handler. If it fires,
        ! we're expected to just return the error code and not to
        ! invoke any existing error handler. As far as bcopy is concerned,
        ! we only set t_lofault if there was an existing lofault handler.
        ! In that case we're expected to invoke the previously existing
        ! handler after resetting the t_lofault value.
        btst    LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        andn    %o5, LOFAULT_SET, %o5           ! clear fault flag
        bnz,pn  %ncc, 3f
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
        mov     %g1, %o0
3:
        ! We're here via bcopy. There must have been an error handler
        ! in place otherwise we would have died a nasty death already.
        jmp     %o5                             ! goto real handler
        mov     %g0, %o0
/*
 *  end of .sm_copyerr
 */
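
/*
 * A C model of the leaf fault-path dispatch above (illustration only;
 * o5 and err stand in for %o5 and %g1, and the modeling types are
 * hypothetical, not the real kthread_t):
 *
 *      #include <stdint.h>
 *
 *      typedef struct { uintptr_t t_lofault; } kthread_model_t;
 *      extern kthread_model_t *curthread_model;
 *
 *      static int
 *      sm_copyerr_model(uintptr_t o5, int err)
 *      {
 *              uintptr_t handler = o5 & ~(uintptr_t)LOFAULT_SET;
 *
 *              curthread_model->t_lofault = handler;   // restore old value
 *              if (o5 & LOFAULT_SET) {
 *                      ((void (*)(void))handler)();    // bcopy: trampoline
 *                      return (0);
 *              }
 *              return (err);                           // kcopy: errno
 *      }
 */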

/*
 * We got here because of a fault during kcopy, or during a bcopy for
 * which a fault handler existed when bcopy was called.
 * The stack and fp registers need to be restored.
 * Errno value is in %g1.
 */
.copyerr:
        sethi   %hi(.copyerr2), %l1
        or      %l1, %lo(.copyerr2), %l1
        membar  #Sync                           ! sync error barrier
        stn     %l1, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        btst    FPUSED_FLAG, %o5
        bz,pt   %xcc, 1f
        and     %o5, LOFAULT_SET, %l1   ! copy flag to %l1

        membar  #Sync                           ! sync error barrier
        wr      %l5, 0, %gsr
        btst    FPRS_FEF, %g5
        bz,pt   %icc, 4f
        nop
        ! restore fpregs from stack
        BLD_FP_FROMSTACK(%o2)
        ba,pt   %ncc, 2f
        wr      %g5, 0, %fprs           ! restore fprs
4:
        FZERO
        wr      %g5, 0, %fprs           ! restore fprs
2:
        ldn     [THREAD_REG + T_LWP], %o2
        brnz,pt %o2, 1f
        nop

        ldsb    [THREAD_REG + T_PREEMPT], %l0
        deccc   %l0
        bnz,pn  %ncc, 1f
        stb     %l0, [THREAD_REG + T_PREEMPT]

        ! Check for a kernel preemption request
        ldn     [THREAD_REG + T_CPU], %l0
        ldub    [%l0 + CPU_KPRUNRUN], %l0
        brnz,a,pt       %l0, 1f ! Need to call kpreempt?
        or      %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag

        ! The kcopy will always set a t_lofault handler. If it fires,
        ! we're expected to just return the error code and not to
        ! invoke any existing error handler. As far as bcopy is concerned,
        ! we only set t_lofault if there was an existing lofault handler.
        ! In that case we're expected to invoke the previously existing
        ! handler after resetting the t_lofault value.
1:
        andn    %o5, COPY_FLAGS, %o5    ! remove flags from lofault address
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault

        ! call kpreempt if necessary
        btst    KPREEMPT_FLAG, %l1
        bz,pt   %icc, 2f
        nop
        call    kpreempt
        rdpr    %pil, %o0       ! pass %pil
2:
        btst    LOFAULT_SET, %l1
        bnz,pn  %ncc, 3f
        nop
        ret
        restore %g1, 0, %o0
3:
        ! We're here via bcopy. There must have been an error handler
        ! in place otherwise we would have died a nasty death already.
        jmp     %o5                             ! goto real handler
        restore %g0, 0, %o0                     ! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
        .asciz  "Unable to restore fp state after copy operation"

        .align  4
.copyerr2:
        set     fp_panic_msg, %o0
        call    panic
        nop
/*
 *  end of .copyerr
 */
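
/*
 * The FP (non-leaf) fault path above, modeled in C (illustration only;
 * every helper named here is a hypothetical stand-in for the assembly:
 * write_gsr/write_fprs model wr %gsr / wr %fprs, restore_fp_from_stack
 * models BLD_FP_FROMSTACK, and zero_fp_regs models FZERO):
 *
 *      #include <stdint.h>
 *
 *      extern void write_gsr(uint64_t), write_fprs(uint64_t);
 *      extern void restore_fp_from_stack(void), zero_fp_regs(void);
 *
 *      static void
 *      copyerr_fp_model(uintptr_t o5, uint64_t saved_gsr, uint64_t saved_fprs)
 *      {
 *              if (o5 & FPUSED_FLAG) {
 *                      write_gsr(saved_gsr);           // from %l5
 *                      if (saved_fprs & FPRS_FEF)
 *                              restore_fp_from_stack();
 *                      else
 *                              zero_fp_regs();         // don't leak data
 *                      write_fprs(saved_fprs);         // from %g5
 *              }
 *              // then restore t_lofault, call kpreempt() if one was
 *              // requested while preemption was off, and finish as in
 *              // the leaf model above (errno for kcopy, trampoline for
 *              // bcopy).
 *      }
 */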

#else   /* NIAGARA_IMPL */
        save    %sp, -SA(MINFRAME), %sp
        set     .copyerr, %l7                   ! copyerr is lofault value
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        or      %o5, LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        b       .do_copy                        ! common code
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
        ! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
        ! into %o5 to indicate it has set a t_lofault handler. We need to
        ! clear the LOFAULT_SET flag before restoring the error handler.
        andn    %o5, LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        ret
        restore %g1, 0, %o0
#endif  /* NIAGARA_IMPL */

        SET_SIZE(kcopy)
#endif  /* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */
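
/*
 * Sketch of the documented precondition (illustration only; bcopy_ok()
 * is a hypothetical helper).  Callers with possibly overlapping regions
 * should use an overlap-safe routine instead.
 *
 *      #include <stddef.h>
 *
 *      static int
 *      bcopy_ok(const char *from, char *to, size_t len)
 *      {
 *              // non-overlap in either direction
 *              return (from + len <= to || to + len <= from);
 *      }
 */
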
#if defined(lint)

/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}

#else   /* lint */

        ENTRY(bcopy)
#if !defined(NIAGARA_IMPL)
        cmp     %o2, FP_COPY                    ! check for small copy/leaf case
        bgt,pt  %ncc, .bcopy_more               !
        nop
.bcopy_small:                                   ! setup error handler
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        tst     %o5
        bz,pt   %icc, .sm_do_copy
        sethi   %hi(.sm_copyerr), %o4
        or      %o4, %lo(.sm_copyerr), %o4      ! .sm_copyerr is lofault value
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        or      %o5, LOFAULT_SET, %o5           ! Error should trampoline
.sm_do_copy:
        mov     %o0, %g1                ! save %o0
        cmp     %o2, SHORTCOPY          ! make sure there is enough to align
        ble,pt  %ncc, .bc_smallest
        andcc   %o1, 0x7, %o3           ! is dest long aligned
        bnz,pn  %ncc, .bc_align
        andcc   %o1, 1, %o3             ! is dest byte aligned

! Destination is long word aligned
.bc_al_src:
        andcc   %o0, 7, %o3
        brnz,pt %o3, .bc_src_dst_unal8
        nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes
 * Also handles finish up for large block moves, so may be less than 32 bytes
 */
.bc_medlong:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medl31
        nop
.bc_medl32:
        ldx     [%o0], %o4              ! move 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count by 32
        stx     %o4, [%o1]
        ldx     [%o0+8], %o4
        stx     %o4, [%o1+8]
        ldx     [%o0+16], %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stx     %o4, [%o1+16]
        ldx     [%o0-8], %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .bc_medl32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]
.bc_medl31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medl7         ! skip if 7 or fewer bytes left
        nop
.bc_medl8:
        ldx     [%o0], %o4              ! move 8 bytes
        add     %o0, 8, %o0             ! increase src ptr by 8
        subcc   %o2, 8, %o2             ! decrease count by 8
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .bc_medl8
        stx     %o4, [%o1-8]
.bc_medl7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bnz,pt  %ncc, .bc_small4        ! do final bytes if not finished

.bc_smallx:                             ! finish up and exit
        tst     %o5
        bz,pt   %ncc, .bc_sm_done
        andn    %o5, COPY_FLAGS, %o5    ! remove flags from lofault address
        membar  #Sync                   ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
.bc_sm_done:
        retl
        mov     %g0, %o0

.bc_small4:
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%o0], %o4              ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bz,pt   %ncc, .bc_smallx
        stw     %o4, [%o1-4]

.bc_small3x:                            ! Exactly 1, 2, or 3 bytes remain
        subcc   %o2, 1, %o2             ! reduce count for cc test
        ldub    [%o0], %o4              ! load one byte
        bz,pt   %ncc, .bc_smallx
        stb     %o4, [%o1]              ! store one byte
        ldub    [%o0+1], %o4            ! load second byte
        subcc   %o2, 1, %o2
        bz,pt   %ncc, .bc_smallx
        stb     %o4, [%o1+1]            ! store second byte
        ldub    [%o0+2], %o4            ! load third byte
        ba      .bc_smallx
        stb     %o4, [%o1+2]            ! store third byte

.bc_smallest:                           ! 7 or fewer bytes remain
        tst     %o2
        bz,pt   %ncc, .bc_smallx
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x
        nop
        ldub    [%o0], %o4              ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stb     %o4, [%o1]              ! write byte
        ldub    [%o0+1], %o4            ! repeat for total of 4 bytes
        add     %o0, 4, %o0             ! advance src by 4
        stb     %o4, [%o1+1]
        ldub    [%o0-2], %o4
        add     %o1, 4, %o1             ! advance dst by 4
        stb     %o4, [%o1-2]
        ldub    [%o0-1], %o4
        bnz,pt  %ncc, .bc_small3x
        stb     %o4, [%o1-1]
        ba      .bc_smallx
        nop

/*
 * Align destination to long word boundary
 */
.bc_align:                              ! byte align test in prior branch delay
        bnz,pt  %ncc, .bc_al_d1
.bc_al_d1f:                             ! dest is now half word aligned
        andcc   %o1, 2, %o3
        bnz,pt  %ncc, .bc_al_d2
.bc_al_d2f:                             ! dest is now word aligned
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .bc_al_src
        nop
.bc_al_d4:                              ! dest is word aligned;  src is unknown
        ldub    [%o0], %o4              ! move a word (src align unknown)
        ldub    [%o0+1], %o3
        sll     %o4, 24, %o4            ! position
        sll     %o3, 16, %o3            ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%o0+2], %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%o0+3], %o4
        or      %o4, %o3, %o4           ! merge
        stw     %o4,[%o1]               ! store four bytes
        add     %o0, 4, %o0             ! adjust src by 4
        add     %o1, 4, %o1             ! adjust dest by 4
        sub     %o2, 4, %o2             ! adjust count by 4
        andcc   %o0, 7, %o3             ! check for src long word alignment
        brz,pt  %o3, .bc_medlong
.bc_src_dst_unal8:
        ! dst is 8-byte aligned, src is not
        ! Size is less than FP_COPY
        ! Following code is to select for alignment
        andcc   %o0, 0x3, %o3           ! test word alignment
        bz,pt   %ncc, .bc_medword
        nop
        andcc   %o0, 0x1, %o3           ! test halfword alignment
        bnz,pt  %ncc, .bc_med_byte      ! go to byte move if not halfword
        andcc   %o0, 0x2, %o3           ! test which byte alignment
        ba      .bc_medhalf
        nop
.bc_al_d1:                              ! align dest to half word
        ldub    [%o0], %o4              ! move a byte
        add     %o0, 1, %o0
        stb     %o4, [%o1]
        add     %o1, 1, %o1
        andcc   %o1, 2, %o3
        bz,pt   %ncc, .bc_al_d2f
        sub     %o2, 1, %o2
.bc_al_d2:                              ! align dest to word
        ldub    [%o0], %o4              ! move a half-word (src align unknown)
        ldub    [%o0+1], %o3
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o4           ! merge
        sth     %o4, [%o1]
        add     %o0, 2, %o0
        add     %o1, 2, %o1
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .bc_al_src
        sub     %o2, 2, %o2
        ba      .bc_al_d4
        nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for medium
 * to short data moves.
 */
.bc_medword:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medw31
        nop
.bc_medw32:
        ld      [%o0], %o4              ! move a block of 32 bytes
        stw     %o4, [%o1]
        ld      [%o0+4], %o4
        stw     %o4, [%o1+4]
        ld      [%o0+8], %o4
        stw     %o4, [%o1+8]
        ld      [%o0+12], %o4
        stw     %o4, [%o1+12]
        ld      [%o0+16], %o4
        stw     %o4, [%o1+16]
        ld      [%o0+20], %o4
        subcc   %o2, 32, %o2            ! decrement length count
        stw     %o4, [%o1+20]
        ld      [%o0+24], %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stw     %o4, [%o1+24]
        ld      [%o0-4], %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .bc_medw32        ! repeat if at least 32 bytes left
        stw     %o4, [%o1-4]
.bc_medw31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medw7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medw15:
        ld      [%o0], %o4              ! move a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        stw     %o4, [%o1]
        add     %o0, 8, %o0             ! increase src ptr by 8
        ld      [%o0-4], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .bc_medw15
        stw     %o4, [%o1-4]
.bc_medw7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%o0], %o4              ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

.bc_medhalf:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medh31
        nop
.bc_medh32:                             ! load and store block of 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count

        lduh    [%o0], %o4              ! move 32 bytes
        lduw    [%o0+2], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+6], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        lduh    [%o0+8], %o4
        lduw    [%o0+10], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+14], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        lduh    [%o0+16], %o4
        lduw    [%o0+18], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+22], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        lduh    [%o0-8], %o4
        lduw    [%o0-6], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0-2], %o4
        or      %o3, %o4, %o4
        bgu,pt  %ncc, .bc_medh32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medh7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medh15:
        lduh    [%o0], %o4              ! move 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        lduw    [%o0+2], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        add     %o1, 8, %o1             ! increase dst ptr by 8
        lduh    [%o0+6], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medh15
        stx     %o4, [%o1-8]
.bc_medh7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        lduh    [%o0], %o4
        sll     %o4, 16, %o4
        lduh    [%o0+2], %o3
        or      %o3, %o4, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

        .align 16
.bc_med_byte:
        bnz,pt  %ncc, .bc_medbh32a      ! go to correct byte move
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medb31
        nop
.bc_medb32:                             ! Alignment 1 or 5
        subcc   %o2, 32, %o2            ! decrement length count

        ldub    [%o0], %o4              ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduh    [%o0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+3], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        ldub    [%o0+8], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0+9], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+11], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+15], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        ldub    [%o0+16], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0+17], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+19], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+23], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        ldub    [%o0-8], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0-7], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0-5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0-1], %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medb32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medb31:                             ! 31 or fewer bytes remaining
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medb15:

        ldub    [%o0], %o4              ! load and store a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        sllx    %o4, 56, %o3
        lduh    [%o0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+3], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medb15
        stx     %o4, [%o1-8]
.bc_medb7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ldub    [%o0], %o4              ! move 4 bytes
        sll     %o4, 24, %o3
        lduh    [%o0+1], %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+3], %o4
        or      %o4, %o3, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

        .align 16
.bc_medbh32a:                           ! Alignment 3 or 7
        ble,pt  %ncc, .bc_medbh31
        nop
.bc_medbh32:                            ! Alignment 3 or 7
        subcc   %o2, 32, %o2            ! decrement length count

        ldub    [%o0], %o4              ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduw    [%o0+1], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        ldub    [%o0+8], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0+9], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+13], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+15], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        ldub    [%o0+16], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0+17], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+21], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+23], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        ldub    [%o0-8], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0-7], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0-3], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0-1], %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medbh32       ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medbh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medbh15:
        ldub    [%o0], %o4              ! load and store a block of 8 bytes
        sllx    %o4, 56, %o3
        lduw    [%o0+1], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]
        subcc   %o2, 8, %o2             ! decrement length count
        add     %o1, 8, %o1             ! increase dst ptr by 8
        add     %o0, 8, %o0             ! increase src ptr by 8
        bgu,pt  %ncc, .bc_medbh15
        stx     %o4, [%o1-8]
        ba      .bc_medb7
        nop

        SET_SIZE(bcopy)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 */
        ENTRY(bcopy_more)
.bcopy_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        brz,pt  %o5, .do_copy
        nop
        sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
        or      %l7, %lo(.copyerr), %l7
        membar  #Sync                           ! sync error barrier
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        ! We've already captured whether t_lofault was zero on entry.
        ! We need to mark ourselves as being from bcopy since both
        ! kcopy and bcopy use the same code path. If LOFAULT_SET is
        ! set and the saved lofault was zero, we won't reset lofault on
        ! returning.
        or      %o5, LOFAULT_SET, %o5
.do_copy:
        ldn     [THREAD_REG + T_LWP], %o3
        brnz,pt %o3, 1f
        nop
/*
 * kpreempt_disable();
 */
        ldsb    [THREAD_REG + T_PREEMPT], %o3
        inc     %o3
        stb     %o3, [THREAD_REG + T_PREEMPT]
1:
/*
 * Following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save registers and fp regs before starting.
 */
        rd      %fprs, %g5              ! check for unused fp
        or      %o5, FPUSED_FLAG, %o5
        ! if fprs.fef == 0, set it.
        ! Setting it when already set costs more than checking
        andcc   %g5, FPRS_FEF, %g5      ! test FEF, fprs.du = fprs.dl = 0
        bz,pt   %ncc, .bc_fp_unused
        prefetch [%i0 + (1 * CACHE_LINE)], #one_read
        BST_FP_TOSTACK(%o3)
        ba      .bc_fp_ready
.bc_fp_unused:
        andcc   %i1, 1, %o3             ! is dest byte aligned
        wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
.bc_fp_ready:
        rd      %gsr, %l5               ! save %gsr value
        bnz,pt  %ncc, .bc_big_d1
.bc_big_d1f:                            ! dest is now half word aligned
        andcc   %i1, 2, %o3
        bnz,pt  %ncc, .bc_big_d2
.bc_big_d2f:                            ! dest is now word aligned
        andcc   %i1, 4, %o3
        bnz,pt  %ncc, .bc_big_d4
.bc_big_d4f:                            ! dest is now long word aligned
        andcc   %i0, 7, %o3             ! is src long word aligned
        brnz,pt %o3, .bc_big_unal8
        prefetch [%i0 + (2 * CACHE_LINE)], #one_read

        ! Src and dst are long word aligned
        ! align dst to 64 byte boundary
        andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .bc_al_to_64
        nop
        sub     %o3, 64, %o3            ! %o3 has negative bytes to move
        add     %i2, %o3, %i2           ! adjust remaining count
        andcc   %o3, 8, %o4             ! odd long words to move?
        brz,pt  %o4, .bc_al_to_16
        nop
        add     %o3, 8, %o3
        ldx     [%i0], %o4
        add     %i0, 8, %i0             ! increment src ptr
        add     %i1, 8, %i1             ! increment dst ptr
        stx     %o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.bc_al_to_16:
        andcc   %o3, 0x30, %o4          ! pair of long words to move?
        brz,pt  %o4, .bc_al_to_64
        nop
.bc_al_mv_16:
        add     %o3, 16, %o3
        ldx     [%i0], %o4
        stx     %o4, [%i1]
        ldx     [%i0+8], %o4
        add     %i0, 16, %i0            ! increment src ptr
        stx     %o4, [%i1+8]
        andcc   %o3, 48, %o4
        brnz,pt %o4, .bc_al_mv_16
        add     %i1, 16, %i1            ! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.bc_al_to_64:
        ! Determine source alignment
        ! to correct 8 byte offset
        andcc   %i0, 32, %o3
        brnz,pn %o3, .bc_aln_1
        andcc   %i0, 16, %o3
        brnz,pn %o3, .bc_aln_01
        andcc   %i0, 8, %o3
        brz,pn  %o3, .bc_aln_000
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_001
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_aln_01:
        brnz,pn %o3, .bc_aln_011
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_010
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_1:
        andcc   %i0, 16, %o3
        brnz,pn %o3, .bc_aln_11
        andcc   %i0, 8, %o3
        brnz,pn %o3, .bc_aln_101
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_100
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_11:
        brz,pn  %o3, .bc_aln_110
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read

.bc_aln_111:
! Alignment off by 8 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        add     %i0, 8, %i0
        sub     %i2, 8, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_111_loop:
        ldda    [%i0]ASI_BLK_P,%d16             ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d30, %d0
        bgt,pt  %ncc, .bc_aln_111_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        ba      .bc_remain_stuff
        add     %i1, 8, %i1
        ! END OF aln_111

.bc_aln_110:
! Alignment off by 16 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        add     %i0, 16, %i0
        sub     %i2, 16, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_110_loop:
        ldda    [%i0]ASI_BLK_P,%d16             ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d28, %d0
        fmovd   %d30, %d2
        bgt,pt  %ncc, .bc_aln_110_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        ba      .bc_remain_stuff
        add     %i1, 16, %i1
        ! END OF aln_110

.bc_aln_101:
! Alignment off by 24 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        add     %i0, 24, %i0
        sub     %i2, 24, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_101_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d26, %d0
        fmovd   %d28, %d2
        fmovd   %d30, %d4
        bgt,pt  %ncc, .bc_aln_101_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        ba      .bc_remain_stuff
        add     %i1, 24, %i1
        ! END OF aln_101

.bc_aln_100:
! Alignment off by 32 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16],%d4
        ldd     [%i0+24],%d6
        add     %i0, 32, %i0
        sub     %i2, 32, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_100_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d24, %d0
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        fmovd   %d30, %d6
        bgt,pt  %ncc, .bc_aln_100_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        ba      .bc_remain_stuff
        add     %i1, 32, %i1
        ! END OF aln_100
1383 
1384 .bc_aln_011:
1385 ! Alignment off by 40 bytes
1386         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1387         ldd     [%i0], %d0
1388         ldd     [%i0+8], %d2
1389         ldd     [%i0+16], %d4
1390         ldd     [%i0+24], %d6
1391         ldd     [%i0+32], %d8
1392         add     %i0, 40, %i0
1393         sub     %i2, 40, %i2
1394         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
1395         and     %i2, 0x7f, %i2          ! residue bytes in %i2
1396         sub     %i1, %i0, %i1
1397 .bc_aln_011_loop:
1398         ldda    [%i0]ASI_BLK_P,%d16     ! block load
1399         subcc   %o3, 64, %o3
1400         fmovd   %d16, %d10
1401         fmovd   %d18, %d12
1402         fmovd   %d20, %d14
1403         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1404         stda    %d0,[%i0+%i1]ASI_BLK_P
1405         add     %i0, 64, %i0
1406         fmovd   %d22, %d0
1407         fmovd   %d24, %d2
1408         fmovd   %d26, %d4
1409         fmovd   %d28, %d6
1410         fmovd   %d30, %d8
1411         bgt,pt  %ncc, .bc_aln_011_loop
1412         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1413         add     %i1, %i0, %i1
1414 
1415         std     %d0, [%i1]
1416         std     %d2, [%i1+8]
1417         std     %d4, [%i1+16]
1418         std     %d6, [%i1+24]
1419         std     %d8, [%i1+32]
1420         ba      .bc_remain_stuff
1421         add     %i1, 40, %i1
1422         ! END OF aln_011
1423 
1424 .bc_aln_010:
1425 ! Alignment off by 48 bytes
1426         ldd     [%i0], %d0
1427         ldd     [%i0+8], %d2
1428         ldd     [%i0+16], %d4
1429         ldd     [%i0+24], %d6
1430         ldd     [%i0+32], %d8
1431         ldd     [%i0+40], %d10
1432         add     %i0, 48, %i0
1433         sub     %i2, 48, %i2
1434         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
1435         and     %i2, 0x7f, %i2          ! residue bytes in %i2
1436         sub     %i1, %i0, %i1
1437 .bc_aln_010_loop:
1438         ldda    [%i0]ASI_BLK_P,%d16     ! block load
1439         subcc   %o3, 64, %o3
1440         fmovd   %d16, %d12
1441         fmovd   %d18, %d14
1442         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1443         stda    %d0,[%i0+%i1]ASI_BLK_P
1444         add     %i0, 64, %i0
1445         fmovd   %d20, %d0
1446         fmovd   %d22, %d2
1447         fmovd   %d24, %d4
1448         fmovd   %d26, %d6
1449         fmovd   %d28, %d8
1450         fmovd   %d30, %d10
1451         bgt,pt  %ncc, .bc_aln_010_loop
1452         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1453         add     %i1, %i0, %i1
1454 
1455         std     %d0, [%i1]
1456         std     %d2, [%i1+8]
1457         std     %d4, [%i1+16]
1458         std     %d6, [%i1+24]
1459         std     %d8, [%i1+32]
1460         std     %d10, [%i1+40]
1461         ba      .bc_remain_stuff
1462         add     %i1, 48, %i1
1463         ! END OF aln_010
1464 
1465 .bc_aln_001:
1466 ! Alignment off by 56 bytes
1467         ldd     [%i0], %d0
1468         ldd     [%i0+8], %d2
1469         ldd     [%i0+16], %d4
1470         ldd     [%i0+24], %d6
1471         ldd     [%i0+32], %d8
1472         ldd     [%i0+40], %d10
1473         ldd     [%i0+48], %d12
1474         add     %i0, 56, %i0
1475         sub     %i2, 56, %i2
1476         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
1477         and     %i2, 0x7f, %i2          ! residue bytes in %i2
1478         sub     %i1, %i0, %i1
1479 .bc_aln_001_loop:
1480         ldda    [%i0]ASI_BLK_P,%d16     ! block load
1481         subcc   %o3, 64, %o3
1482         fmovd   %d16, %d14
1483         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1484         stda    %d0,[%i0+%i1]ASI_BLK_P
1485         add     %i0, 64, %i0
1486         fmovd   %d18, %d0
1487         fmovd   %d20, %d2
1488         fmovd   %d22, %d4
1489         fmovd   %d24, %d6
1490         fmovd   %d26, %d8
1491         fmovd   %d28, %d10
1492         fmovd   %d30, %d12
1493         bgt,pt  %ncc, .bc_aln_001_loop
1494         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1495         add     %i1, %i0, %i1
1496 
1497         std     %d0, [%i1]
1498         std     %d2, [%i1+8]
1499         std     %d4, [%i1+16]
1500         std     %d6, [%i1+24]
1501         std     %d8, [%i1+32]
1502         std     %d10, [%i1+40]
1503         std     %d12, [%i1+48]
1504         ba      .bc_remain_stuff
1505         add     %i1, 56, %i1
1506         ! END OF aln_001
1507 
1508 .bc_aln_000:
1509         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1510         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
1511         and     %i2, 0x7f, %i2          ! residue bytes in %i2
1512         sub     %i1, %i0, %i1
1513 .bc_aln_000_loop:
1514         ldda    [%i0]ASI_BLK_P,%d0
1515         subcc   %o3, 64, %o3
1516         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1517         stda    %d0,[%i0+%i1]ASI_BLK_P
1518         add     %i0, 64, %i0
1519         bgt,pt  %ncc, .bc_aln_000_loop
1520         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1521         add     %i1, %i0, %i1
1522 
1523         ! END OF aln_000
1524 
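             ! .bc_remain_stuff mops up the 0-127 trailing bytes left by
             ! the block loops.  In outline (a sketch; the counts in the
             ! real code carry small biases so plain cc tests work):
             !
             !      while (cnt >= 32) copy 32 bytes (ldx/stx);
             !      while (cnt >= 8)  copy 8 bytes;
             !      if (cnt >= 4)     copy 4 bytes;
             !      copy the last 0-3 bytes singly (.bc_unaln3x/.bc_exit)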
1525 .bc_remain_stuff:
1526         subcc   %i2, 31, %i2            ! adjust length to allow cc test
1527         ble,pt  %ncc, .bc_aln_31
1528         nop
1529 .bc_aln_32:
1530         ldx     [%i0], %o4              ! move 32 bytes
1531         subcc   %i2, 32, %i2            ! decrement length count by 32
1532         stx     %o4, [%i1]
1533         ldx     [%i0+8], %o4
1534         stx     %o4, [%i1+8]
1535         ldx     [%i0+16], %o4
1536         add     %i0, 32, %i0            ! increase src ptr by 32
1537         stx     %o4, [%i1+16]
1538         ldx     [%i0-8], %o4
1539         add     %i1, 32, %i1            ! increase dst ptr by 32
1540         bgu,pt  %ncc, .bc_aln_32        ! repeat if at least 32 bytes left
1541         stx     %o4, [%i1-8]
1542 .bc_aln_31:
1543         addcc   %i2, 24, %i2            ! adjust count to be off by 7
1544         ble,pt  %ncc, .bc_aln_7         ! skip if 7 or fewer bytes left
1545         nop                             !
1546 .bc_aln_15:
1547         ldx     [%i0], %o4              ! move 8 bytes
1548         add     %i0, 8, %i0             ! increase src ptr by 8
1549         subcc   %i2, 8, %i2             ! decrease count by 8
1550         add     %i1, 8, %i1             ! increase dst ptr by 8
1551         bgu,pt  %ncc, .bc_aln_15
1552         stx     %o4, [%i1-8]            !
1553 .bc_aln_7:
1554         addcc   %i2, 7, %i2             ! finish adjustment of remaining count
1555         bz,pt   %ncc, .bc_exit          ! exit if finished
1556         cmp     %i2, 4
1557         blt,pt  %ncc, .bc_unaln3x       ! skip if less than 4 bytes left
1558         nop                             !
1559         ld      [%i0], %o4              ! move 4 bytes
1560         add     %i0, 4, %i0             ! increase src ptr by 4
1561         add     %i1, 4, %i1             ! increase dst ptr by 4
1562         subcc   %i2, 4, %i2             ! decrease count by 4
1563         bnz     .bc_unaln3x
1564         stw     %o4, [%i1-4]
1565         ba      .bc_exit
1566         nop
1567 
1568         ! destination alignment code
1569 .bc_big_d1:
1570         ldub    [%i0], %o4              ! move a byte
1571         add     %i0, 1, %i0
1572         stb     %o4, [%i1]
1573         add     %i1, 1, %i1
1574         andcc   %i1, 2, %o3
1575         bz,pt   %ncc, .bc_big_d2f
1576         sub     %i2, 1, %i2
1577 .bc_big_d2:
1578         ldub    [%i0], %o4              ! move a half-word (src align unknown)
1579         ldub    [%i0+1], %o3
1580         add     %i0, 2, %i0
1581         sll     %o4, 8, %o4             ! position
1582         or      %o4, %o3, %o4           ! merge
1583         sth     %o4, [%i1]
1584         add     %i1, 2, %i1
1585         andcc   %i1, 4, %o3
1586         bz,pt   %ncc, .bc_big_d4f
1587         sub     %i2, 2, %i2
1588 .bc_big_d4:
1589         ldub    [%i0], %o4              ! move a word (src align unknown)
1590         ldub    [%i0+1], %o3
1591         sll     %o4, 24, %o4            ! position
1592         sll     %o3, 16, %o3            ! position
1593         or      %o4, %o3, %o3           ! merge
1594         ldub    [%i0+2], %o4
1595         sll     %o4, 8, %o4             ! position
1596         or      %o4, %o3, %o3           ! merge
1597         ldub    [%i0+3], %o4
1598         or      %o4, %o3, %o4           ! merge
1599         stw     %o4,[%i1]               ! store four bytes
1600         add     %i0, 4, %i0             ! adjust src by 4
1601         add     %i1, 4, %i1             ! adjust dest by 4
1602         ba      .bc_big_d4f
1603         sub     %i2, 4, %i2             ! adjust count by 4
1604 
1605 
1606         ! Dst is on 8 byte boundary; src is not;
1607 .bc_big_unal8:
1608         andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
1609         bz      %ncc, .bc_unalnsrc
1610         sub     %o3, 64, %o3            ! %o3 will be multiple of 8
1611         neg     %o3                     ! bytes until dest is 64 byte aligned
1612         sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
1613         ! Move bytes according to source alignment
1614         andcc   %i0, 0x1, %o4
1615         bnz     %ncc, .bc_unalnbyte     ! check for byte alignment
1616         nop
1617         andcc   %i0, 2, %o4             ! check for half word alignment
1618         bnz     %ncc, .bc_unalnhalf
1619         nop
1620         ! Src is word aligned, move bytes until dest 64 byte aligned
1621 .bc_unalnword:
1622         ld      [%i0], %o4              ! load 4 bytes
1623         stw     %o4, [%i1]              ! and store 4 bytes
1624         ld      [%i0+4], %o4            ! load 4 bytes
1625         add     %i0, 8, %i0             ! increase src ptr by 8
1626         stw     %o4, [%i1+4]            ! and store 4 bytes
1627         subcc   %o3, 8, %o3             ! decrease count by 8
1628         bnz     %ncc, .bc_unalnword
1629         add     %i1, 8, %i1             ! increase dst ptr by 8
1630         ba      .bc_unalnsrc
1631         nop
1632 
1633         ! Src is half-word aligned, move bytes until dest 64 byte aligned
1634 .bc_unalnhalf:
1635         lduh    [%i0], %o4              ! load 2 bytes
1636         sllx    %o4, 32, %i3            ! shift left
1637         lduw    [%i0+2], %o4
1638         or      %o4, %i3, %i3
1639         sllx    %i3, 16, %i3
1640         lduh    [%i0+6], %o4
1641         or      %o4, %i3, %i3
1642         stx     %i3, [%i1]
1643         add     %i0, 8, %i0
1644         subcc   %o3, 8, %o3
1645         bnz     %ncc, .bc_unalnhalf
1646         add     %i1, 8, %i1
1647         ba      .bc_unalnsrc
1648         nop
1649 
1650         ! Src is Byte aligned, move bytes until dest 64 byte aligned
1651 .bc_unalnbyte:
1652         sub     %i1, %i0, %i1           ! share pointer advance
1653 .bc_unalnbyte_loop:
1654         ldub    [%i0], %o4
1655         sllx    %o4, 56, %i3
1656         lduh    [%i0+1], %o4
1657         sllx    %o4, 40, %o4
1658         or      %o4, %i3, %i3
1659         lduh    [%i0+3], %o4
1660         sllx    %o4, 24, %o4
1661         or      %o4, %i3, %i3
1662         lduh    [%i0+5], %o4
1663         sllx    %o4, 8, %o4
1664         or      %o4, %i3, %i3
1665         ldub    [%i0+7], %o4
1666         or      %o4, %i3, %i3
1667         stx     %i3, [%i1+%i0]
1668         subcc   %o3, 8, %o3
1669         bnz     %ncc, .bc_unalnbyte_loop
1670         add     %i0, 8, %i0
1671         add     %i1, %i0, %i1           ! restore pointer
1672 
1673         ! Destination is now block (64 byte) aligned; src is not 8 byte aligned
1674 .bc_unalnsrc:
1675         andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
1676         and     %i2, 0x3f, %i2          ! residue bytes in %i2
1677         add     %i2, 64, %i2            ! Ensure we don't load beyond
1678         sub     %i3, 64, %i3            ! end of source buffer
1679 
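             ! The .bc_unaln_* loops below handle a src that is not even
             ! 8-byte aligned: alignaddr loads the byte offset into %gsr,
             ! each pass block-loads the next 64 src bytes from the
             ! block-aligned address in %o4, and faligndata shifts each
             ! adjacent register pair to produce 64 aligned output bytes.
             ! Per pass, roughly (illustrative only):
             !
             !      %d48-%d62 = faligndata(carry ++ new block)
             !      block-store %d48-%d62 to the aligned dst
             !      carry     = tail doubles of the new block
             !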
1680         andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
1681         prefetch [%o4 + (3 * CACHE_LINE)], #one_read
1682         alignaddr %i0, %g0, %g0         ! generate %gsr
1683         add     %i0, %i3, %i0           ! advance %i0 to after blocks
1684         !
1685         ! Determine source alignment to correct 8 byte offset
1686         andcc   %i0, 0x20, %o3
1687         brnz,pn %o3, .bc_unaln_1
1688         andcc   %i0, 0x10, %o3
1689         brnz,pn %o3, .bc_unaln_01
1690         andcc   %i0, 0x08, %o3
1691         brz,a   %o3, .bc_unaln_000
1692         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1693         ba      .bc_unaln_001
1694         nop
1695 .bc_unaln_01:
1696         brnz,a  %o3, .bc_unaln_011
1697         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1698         ba      .bc_unaln_010
1699         nop
1700 .bc_unaln_1:
1701         brnz,pn %o3, .bc_unaln_11
1702         andcc   %i0, 0x08, %o3
1703         brnz,a  %o3, .bc_unaln_101
1704         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1705         ba      .bc_unaln_100
1706         nop
1707 .bc_unaln_11:
1708         brz,pn  %o3, .bc_unaln_110
1709         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1710 
1711 .bc_unaln_111:
1712         ldd     [%o4+56], %d14
1713 .bc_unaln_111_loop:
1714         add     %o4, 64, %o4
1715         ldda    [%o4]ASI_BLK_P, %d16
1716         faligndata %d14, %d16, %d48
1717         faligndata %d16, %d18, %d50
1718         faligndata %d18, %d20, %d52
1719         faligndata %d20, %d22, %d54
1720         faligndata %d22, %d24, %d56
1721         faligndata %d24, %d26, %d58
1722         faligndata %d26, %d28, %d60
1723         faligndata %d28, %d30, %d62
1724         fmovd   %d30, %d14
1725         stda    %d48, [%i1]ASI_BLK_P
1726         subcc   %i3, 64, %i3
1727         add     %i1, 64, %i1
1728         bgu,pt  %ncc, .bc_unaln_111_loop
1729         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1730         ba      .bc_unaln_done
1731         nop
1732 
1733 .bc_unaln_110:
1734         ldd     [%o4+48], %d12
1735         ldd     [%o4+56], %d14
1736 .bc_unaln_110_loop:
1737         add     %o4, 64, %o4
1738         ldda    [%o4]ASI_BLK_P, %d16
1739         faligndata %d12, %d14, %d48
1740         faligndata %d14, %d16, %d50
1741         faligndata %d16, %d18, %d52
1742         faligndata %d18, %d20, %d54
1743         faligndata %d20, %d22, %d56
1744         faligndata %d22, %d24, %d58
1745         faligndata %d24, %d26, %d60
1746         faligndata %d26, %d28, %d62
1747         fmovd   %d28, %d12
1748         fmovd   %d30, %d14
1749         stda    %d48, [%i1]ASI_BLK_P
1750         subcc   %i3, 64, %i3
1751         add     %i1, 64, %i1
1752         bgu,pt  %ncc, .bc_unaln_110_loop
1753         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1754         ba      .bc_unaln_done
1755         nop
1756 
1757 .bc_unaln_101:
1758         ldd     [%o4+40], %d10
1759         ldd     [%o4+48], %d12
1760         ldd     [%o4+56], %d14
1761 .bc_unaln_101_loop:
1762         add     %o4, 64, %o4
1763         ldda    [%o4]ASI_BLK_P, %d16
1764         faligndata %d10, %d12, %d48
1765         faligndata %d12, %d14, %d50
1766         faligndata %d14, %d16, %d52
1767         faligndata %d16, %d18, %d54
1768         faligndata %d18, %d20, %d56
1769         faligndata %d20, %d22, %d58
1770         faligndata %d22, %d24, %d60
1771         faligndata %d24, %d26, %d62
1772         fmovd   %d26, %d10
1773         fmovd   %d28, %d12
1774         fmovd   %d30, %d14
1775         stda    %d48, [%i1]ASI_BLK_P
1776         subcc   %i3, 64, %i3
1777         add     %i1, 64, %i1
1778         bgu,pt  %ncc, .bc_unaln_101_loop
1779         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1780         ba      .bc_unaln_done
1781         nop
1782 
1783 .bc_unaln_100:
1784         ldd     [%o4+32], %d8
1785         ldd     [%o4+40], %d10
1786         ldd     [%o4+48], %d12
1787         ldd     [%o4+56], %d14
1788 .bc_unaln_100_loop:
1789         add     %o4, 64, %o4
1790         ldda    [%o4]ASI_BLK_P, %d16
1791         faligndata %d8, %d10, %d48
1792         faligndata %d10, %d12, %d50
1793         faligndata %d12, %d14, %d52
1794         faligndata %d14, %d16, %d54
1795         faligndata %d16, %d18, %d56
1796         faligndata %d18, %d20, %d58
1797         faligndata %d20, %d22, %d60
1798         faligndata %d22, %d24, %d62
1799         fmovd   %d24, %d8
1800         fmovd   %d26, %d10
1801         fmovd   %d28, %d12
1802         fmovd   %d30, %d14
1803         stda    %d48, [%i1]ASI_BLK_P
1804         subcc   %i3, 64, %i3
1805         add     %i1, 64, %i1
1806         bgu,pt  %ncc, .bc_unaln_100_loop
1807         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1808         ba      .bc_unaln_done
1809         nop
1810 
1811 .bc_unaln_011:
1812         ldd     [%o4+24], %d6
1813         ldd     [%o4+32], %d8
1814         ldd     [%o4+40], %d10
1815         ldd     [%o4+48], %d12
1816         ldd     [%o4+56], %d14
1817 .bc_unaln_011_loop:
1818         add     %o4, 64, %o4
1819         ldda    [%o4]ASI_BLK_P, %d16
1820         faligndata %d6, %d8, %d48
1821         faligndata %d8, %d10, %d50
1822         faligndata %d10, %d12, %d52
1823         faligndata %d12, %d14, %d54
1824         faligndata %d14, %d16, %d56
1825         faligndata %d16, %d18, %d58
1826         faligndata %d18, %d20, %d60
1827         faligndata %d20, %d22, %d62
1828         fmovd   %d22, %d6
1829         fmovd   %d24, %d8
1830         fmovd   %d26, %d10
1831         fmovd   %d28, %d12
1832         fmovd   %d30, %d14
1833         stda    %d48, [%i1]ASI_BLK_P
1834         subcc   %i3, 64, %i3
1835         add     %i1, 64, %i1
1836         bgu,pt  %ncc, .bc_unaln_011_loop
1837         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1838         ba      .bc_unaln_done
1839         nop
1840 
1841 .bc_unaln_010:
1842         ldd     [%o4+16], %d4
1843         ldd     [%o4+24], %d6
1844         ldd     [%o4+32], %d8
1845         ldd     [%o4+40], %d10
1846         ldd     [%o4+48], %d12
1847         ldd     [%o4+56], %d14
1848 .bc_unaln_010_loop:
1849         add     %o4, 64, %o4
1850         ldda    [%o4]ASI_BLK_P, %d16
1851         faligndata %d4, %d6, %d48
1852         faligndata %d6, %d8, %d50
1853         faligndata %d8, %d10, %d52
1854         faligndata %d10, %d12, %d54
1855         faligndata %d12, %d14, %d56
1856         faligndata %d14, %d16, %d58
1857         faligndata %d16, %d18, %d60
1858         faligndata %d18, %d20, %d62
1859         fmovd   %d20, %d4
1860         fmovd   %d22, %d6
1861         fmovd   %d24, %d8
1862         fmovd   %d26, %d10
1863         fmovd   %d28, %d12
1864         fmovd   %d30, %d14
1865         stda    %d48, [%i1]ASI_BLK_P
1866         subcc   %i3, 64, %i3
1867         add     %i1, 64, %i1
1868         bgu,pt  %ncc, .bc_unaln_010_loop
1869         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1870         ba      .bc_unaln_done
1871         nop
1872 
1873 .bc_unaln_001:
1874         ldd     [%o4+8], %d2
1875         ldd     [%o4+16], %d4
1876         ldd     [%o4+24], %d6
1877         ldd     [%o4+32], %d8
1878         ldd     [%o4+40], %d10
1879         ldd     [%o4+48], %d12
1880         ldd     [%o4+56], %d14
1881 .bc_unaln_001_loop:
1882         add     %o4, 64, %o4
1883         ldda    [%o4]ASI_BLK_P, %d16
1884         faligndata %d2, %d4, %d48
1885         faligndata %d4, %d6, %d50
1886         faligndata %d6, %d8, %d52
1887         faligndata %d8, %d10, %d54
1888         faligndata %d10, %d12, %d56
1889         faligndata %d12, %d14, %d58
1890         faligndata %d14, %d16, %d60
1891         faligndata %d16, %d18, %d62
1892         fmovd   %d18, %d2
1893         fmovd   %d20, %d4
1894         fmovd   %d22, %d6
1895         fmovd   %d24, %d8
1896         fmovd   %d26, %d10
1897         fmovd   %d28, %d12
1898         fmovd   %d30, %d14
1899         stda    %d48, [%i1]ASI_BLK_P
1900         subcc   %i3, 64, %i3
1901         add     %i1, 64, %i1
1902         bgu,pt  %ncc, .bc_unaln_001_loop
1903         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1904         ba      .bc_unaln_done
1905         nop
1906 
1907 .bc_unaln_000:
1908         ldda    [%o4]ASI_BLK_P, %d0
1909 .bc_unaln_000_loop:
1910         add     %o4, 64, %o4
1911         ldda    [%o4]ASI_BLK_P, %d16
1912         faligndata %d0, %d2, %d48
1913         faligndata %d2, %d4, %d50
1914         faligndata %d4, %d6, %d52
1915         faligndata %d6, %d8, %d54
1916         faligndata %d8, %d10, %d56
1917         faligndata %d10, %d12, %d58
1918         faligndata %d12, %d14, %d60
1919         faligndata %d14, %d16, %d62
1920         fmovd   %d16, %d0
1921         fmovd   %d18, %d2
1922         fmovd   %d20, %d4
1923         fmovd   %d22, %d6
1924         fmovd   %d24, %d8
1925         fmovd   %d26, %d10
1926         fmovd   %d28, %d12
1927         fmovd   %d30, %d14
1928         stda    %d48, [%i1]ASI_BLK_P
1929         subcc   %i3, 64, %i3
1930         add     %i1, 64, %i1
1931         bgu,pt  %ncc, .bc_unaln_000_loop
1932         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1933 
1934 .bc_unaln_done:
1935         ! Handle trailing bytes, 64 to 127
1936         ! Dest long word aligned, Src not long word aligned
1937         cmp     %i2, 15
1938         bleu    %ncc, .bc_unaln_short
1939 
1940         andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
1941         and     %i2, 0x7, %i2           ! residue bytes in %i2
1942         add     %i2, 8, %i2
1943         sub     %i3, 8, %i3             ! ensure we don't load past end of src
1944         andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
1945         add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
1946         ldd     [%o4], %d0              ! fetch partial word
1947 .bc_unaln_by8:
1948         ldd     [%o4+8], %d2
1949         add     %o4, 8, %o4
1950         faligndata %d0, %d2, %d16
1951         subcc   %i3, 8, %i3
1952         std     %d16, [%i1]
1953         fmovd   %d2, %d0
1954         bgu,pt  %ncc, .bc_unaln_by8
1955         add     %i1, 8, %i1
1956 
1957 .bc_unaln_short:
1958         cmp     %i2, 8
1959         blt,pt  %ncc, .bc_unalnfin
1960         nop
1961         ldub    [%i0], %o4
1962         sll     %o4, 24, %o3
1963         ldub    [%i0+1], %o4
1964         sll     %o4, 16, %o4
1965         or      %o4, %o3, %o3
1966         ldub    [%i0+2], %o4
1967         sll     %o4, 8, %o4
1968         or      %o4, %o3, %o3
1969         ldub    [%i0+3], %o4
1970         or      %o4, %o3, %o3
1971         stw     %o3, [%i1]
1972         ldub    [%i0+4], %o4
1973         sll     %o4, 24, %o3
1974         ldub    [%i0+5], %o4
1975         sll     %o4, 16, %o4
1976         or      %o4, %o3, %o3
1977         ldub    [%i0+6], %o4
1978         sll     %o4, 8, %o4
1979         or      %o4, %o3, %o3
1980         ldub    [%i0+7], %o4
1981         or      %o4, %o3, %o3
1982         stw     %o3, [%i1+4]
1983         add     %i0, 8, %i0
1984         add     %i1, 8, %i1
1985         sub     %i2, 8, %i2
1986 .bc_unalnfin:
1987         cmp     %i2, 4
1988         blt,pt  %ncc, .bc_unalnz
1989         tst     %i2
1990         ldub    [%i0], %o3              ! read byte
1991         subcc   %i2, 4, %i2             ! reduce count by 4
1992         sll     %o3, 24, %o3            ! position
1993         ldub    [%i0+1], %o4
1994         sll     %o4, 16, %o4            ! position
1995         or      %o4, %o3, %o3           ! merge
1996         ldub    [%i0+2], %o4
1997         sll     %o4, 8, %o4             ! position
1998         or      %o4, %o3, %o3           ! merge
1999         add     %i1, 4, %i1             ! advance dst by 4
2000         ldub    [%i0+3], %o4
2001         add     %i0, 4, %i0             ! advance src by 4
2002         or      %o4, %o3, %o4           ! merge
2003         bnz,pt  %ncc, .bc_unaln3x
2004         stw     %o4, [%i1-4]
2005         ba      .bc_exit
2006         nop
2007 .bc_unalnz:
2008         bz,pt   %ncc, .bc_exit
2009 .bc_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
2010         subcc   %i2, 1, %i2             ! reduce count for cc test
2011         ldub    [%i0], %o4              ! load one byte
2012         bz,pt   %ncc, .bc_exit
2013         stb     %o4, [%i1]              ! store one byte
2014         ldub    [%i0+1], %o4            ! load second byte
2015         subcc   %i2, 1, %i2
2016         bz,pt   %ncc, .bc_exit
2017         stb     %o4, [%i1+1]            ! store second byte
2018         ldub    [%i0+2], %o4            ! load third byte
2019         stb     %o4, [%i1+2]            ! store third byte
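
             ! Common exit for the FP bcopy path; a sketch of the code
             ! that follows (no new logic, illustration only):
             !
             !      restore %gsr; reload FP regs from the stack if they
             !          were saved, else zero them and restore %fprs;
             !      if (!lwp && --t_preempt == 0 && CPU_KPRUNRUN)
             !              note that kpreempt() is needed;
             !      if (LOFAULT_SET) {
             !              if (saved handler != NULL) restore t_lofault;
             !              if (kpreempt noted) kpreempt(%pil);
             !      }
             !      return 0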
2020 .bc_exit:
2021         wr      %l5, %g0, %gsr          ! restore %gsr
2022         brnz    %g5, .bc_fp_restore
2023         and     %o5, COPY_FLAGS, %l1    ! save flags in %l1
2024         FZERO
2025         wr      %g5, %g0, %fprs
2026         ba,pt   %ncc, .bc_ex2
2027         nop
2028 .bc_fp_restore:
2029         BLD_FP_FROMSTACK(%o4)
2030 .bc_ex2:
2031         ldn     [THREAD_REG + T_LWP], %o2
2032         brnz,pt %o2, 1f
2033         nop
2034 
2035         ldsb    [THREAD_REG + T_PREEMPT], %l0
2036         deccc   %l0
2037         bnz,pn  %ncc, 1f
2038         stb     %l0, [THREAD_REG + T_PREEMPT]
2039 
2040         ! Check for a kernel preemption request
2041         ldn     [THREAD_REG + T_CPU], %l0
2042         ldub    [%l0 + CPU_KPRUNRUN], %l0
2043         brnz,a,pt       %l0, 1f ! Need to call kpreempt?
2044         or      %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
2045 1:
2046         btst    LOFAULT_SET, %l1
2047         bz,pn   %icc, 3f
2048         andncc  %o5, COPY_FLAGS, %o5
2049         ! Here via bcopy. Check to see if the handler was NULL.
2050         ! If so, just return quietly. Otherwise, reset the
2051         ! handler and return.
2052         bz,pn %ncc, 2f
2053         nop
2054         membar  #Sync
2055         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2056 2:
2057         btst    KPREEMPT_FLAG, %l1
2058         bz,pt   %icc, 3f
2059         nop
2060         call    kpreempt
2061         rdpr    %pil, %o0               ! pass %pil
2062 3:
2063         ret
2064         restore %g0, 0, %o0
2065         
2066         SET_SIZE(bcopy_more)
2067 
2068 
2069 #else   /* NIAGARA_IMPL */
2070         save    %sp, -SA(MINFRAME), %sp
2071         clr     %o5                     ! flag LOFAULT_SET is not set for bcopy
2072 .do_copy:
2073         cmp     %i2, 12                 ! for small counts
2074         blu     %ncc, .bytecp           ! just copy bytes
2075         .empty
2076 
2077         cmp     %i2, 128                ! for less than 128 bytes
2078         blu,pn  %ncc, .bcb_punt         ! no block st/quad ld
2079         nop
2080 
2081         set     use_hw_bcopy, %o2
2082         ld      [%o2], %o2
2083         brz,pn  %o2, .bcb_punt
2084         nop
2085 
2086         subcc   %i1, %i0, %i3
2087         bneg,a,pn %ncc, 1f
2088         neg     %i3
2089 1:
2090         /*
2091          * Compare against 256 since we should be checking block addresses
2092          * and (dest & ~63) - (src & ~63) can be 3 blocks even if
2093          * src = dest + (64 * 3) + 63.
2094          */
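             ! e.g. dst block aligned and src = dst + 255: the block
             ! addresses then differ by only 192 bytes (3 blocks); a raw
             ! difference >= 256 guarantees at least 4 blocks apart.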
2095         cmp     %i3, 256
2096         blu,pn  %ncc, .bcb_punt
2097         nop
2098 
2099         /*
2100          * Copies that reach here have at least 2 blocks of data to copy.
2101          */
2102 .do_blockcopy:
2103         ! Swap src/dst since the code below is memcpy code
2104         ! and memcpy/bcopy have different calling sequences
2105         mov     %i1, %i5
2106         mov     %i0, %i1
2107         mov     %i5, %i0
2108 
2109         ! Block (64 bytes) align the destination.
2110         andcc   %i0, 0x3f, %i3          ! is dst 64-byte block aligned?
2111         bz      %xcc, .chksrc           ! dst is already block aligned
2112         sub     %i3, 0x40, %i3
2113         neg     %i3                     ! bytes until dst is 64-byte aligned
2114         sub     %i2, %i3, %i2           ! update i2 with new count
2115 
2116         ! Based on source and destination alignment do
2117         ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2118 
2119         ! Is dst & src 8B aligned
2120         or      %i0, %i1, %o2
2121         andcc   %o2, 0x7, %g0
2122         bz      %ncc, .alewdcp
2123         nop
2124 
2125         ! Is dst & src 4B aligned
2126         andcc   %o2, 0x3, %g0
2127         bz      %ncc, .alwdcp
2128         nop
2129 
2130         ! Is dst & src 2B aligned
2131         andcc   %o2, 0x1, %g0
2132         bz      %ncc, .alhlfwdcp
2133         nop
2134 
2135         ! 1B aligned
2136 1:      ldub    [%i1], %o2
2137         stb     %o2, [%i0]
2138         inc     %i1
2139         deccc   %i3
2140         bgu,pt  %ncc, 1b
2141         inc     %i0
2142 
2143         ba      .chksrc
2144         nop
2145 
2146         ! dst & src 4B aligned
2147 .alwdcp:
2148         ld      [%i1], %o2
2149         st      %o2, [%i0]
2150         add     %i1, 0x4, %i1
2151         subcc   %i3, 0x4, %i3
2152         bgu,pt  %ncc, .alwdcp
2153         add     %i0, 0x4, %i0
2154 
2155         ba      .chksrc
2156         nop
2157 
2158         ! dst & src 2B aligned
2159 .alhlfwdcp:
2160         lduh    [%i1], %o2
2161         stuh    %o2, [%i0]
2162         add     %i1, 0x2, %i1
2163         subcc   %i3, 0x2, %i3
2164         bgu,pt  %ncc, .alhlfwdcp
2165         add     %i0, 0x2, %i0
2166 
2167         ba      .chksrc
2168         nop
2169 
2170         ! dst & src 8B aligned
2171 .alewdcp:
2172         ldx     [%i1], %o2
2173         stx     %o2, [%i0]
2174         add     %i1, 0x8, %i1
2175         subcc   %i3, 0x8, %i3
2176         bgu,pt  %ncc, .alewdcp
2177         add     %i0, 0x8, %i0
2178 
2179         ! Now Destination is block (64 bytes) aligned
2180 .chksrc:
2181         andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
2182         sub     %i2, %i3, %i2           ! Residue bytes in %i2
2183 
2184         mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2185 
2186         andcc   %i1, 0xf, %o2           ! is src quadword aligned
2187         bz,pn   %xcc, .blkcpy           ! src offset in %o2
2188         nop
2189         cmp     %o2, 0x8
2190         bg      .cpy_upper_double
2191         nop
2192         bl      .cpy_lower_double
2193         nop
2194 
2195         ! Falls through when source offset is equal to 8, i.e.
2196         ! source is double word aligned.
2197         ! In this case no shift/merge of data is required
2198         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
2199         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
2200         prefetch [%l0+0x0], #one_read
2201         ldda    [%i1+0x0]%asi, %l2
2202 loop0:
2203         ldda    [%i1+0x10]%asi, %l4
2204         prefetch [%l0+0x40], #one_read
2205 
2206         stxa    %l3, [%i0+0x0]%asi
2207         stxa    %l4, [%i0+0x8]%asi
2208 
2209         ldda    [%i1+0x20]%asi, %l2
2210         stxa    %l5, [%i0+0x10]%asi
2211         stxa    %l2, [%i0+0x18]%asi
2212 
2213         ldda    [%i1+0x30]%asi, %l4
2214         stxa    %l3, [%i0+0x20]%asi
2215         stxa    %l4, [%i0+0x28]%asi
2216 
2217         ldda    [%i1+0x40]%asi, %l2
2218         stxa    %l5, [%i0+0x30]%asi
2219         stxa    %l2, [%i0+0x38]%asi
2220 
2221         add     %l0, 0x40, %l0
2222         add     %i1, 0x40, %i1
2223         subcc   %i3, 0x40, %i3
2224         bgu,pt  %xcc, loop0
2225         add     %i0, 0x40, %i0
2226         ba      .blkdone
2227         add     %i1, %o2, %i1           ! increment the source by src offset
2228                                         ! the src offset was stored in %o2
2229 
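             ! The two loops below handle a src that is 8-byte but not
             ! 16-byte aligned.  src is backed up to a 16-byte boundary so
             ! the quad loads are legal, and ALIGN_DATA merges neighboring
             ! 8-byte chunks using the shift counts in %o0/%o1
             ! (%o0 + %o1 == 64).  Per output double, roughly (a sketch of
             ! the macro's effect):
             !
             !      out = (prev << %o0) | (next >> %o1)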
2230 .cpy_lower_double:
2231         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
2232         sll     %o2, 3, %o0             ! %o0 left shift
2233         mov     0x40, %o1
2234         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
2235         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
2236         prefetch [%l0+0x0], #one_read
2237         ldda    [%i1+0x0]%asi, %l2      ! partial data in %l2 and %l3 has
2238                                         ! complete data
2239 loop1:
2240         ldda    [%i1+0x10]%asi, %l4     ! %l4 has partial data for this read.
2241         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
2242                                                         ! into %l2 and %l3
2243         prefetch [%l0+0x40], #one_read
2244         stxa    %l2, [%i0+0x0]%asi
2245         stxa    %l3, [%i0+0x8]%asi
2246 
2247         ldda    [%i1+0x20]%asi, %l2
2248         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
2249         stxa    %l4, [%i0+0x10]%asi                     ! %l4 from previous read
2250         stxa    %l5, [%i0+0x18]%asi                     ! into %l4 and %l5
2251 
2252         ! Repeat the same for next 32 bytes.
2253 
2254         ldda    [%i1+0x30]%asi, %l4
2255         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2256         stxa    %l2, [%i0+0x20]%asi
2257         stxa    %l3, [%i0+0x28]%asi
2258 
2259         ldda    [%i1+0x40]%asi, %l2
2260         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2261         stxa    %l4, [%i0+0x30]%asi
2262         stxa    %l5, [%i0+0x38]%asi
2263 
2264         add     %l0, 0x40, %l0
2265         add     %i1, 0x40, %i1
2266         subcc   %i3, 0x40, %i3
2267         bgu,pt  %xcc, loop1
2268         add     %i0, 0x40, %i0
2269         ba      .blkdone
2270         add     %i1, %o2, %i1           ! increment the source by src offset
2271                                         ! the src offset was stored in %o2
2272 
2273 .cpy_upper_double:
2274         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
2275         mov     0x8, %o0
2276         sub     %o2, %o0, %o0
2277         sll     %o0, 3, %o0             ! %o0 left shift
2278         mov     0x40, %o1
2279         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
2280         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
2281         prefetch [%l0+0x0], #one_read
2282         ldda    [%i1+0x0]%asi, %l2      ! partial data in %l3 for this read and
2283                                         ! no data in %l2
2284 loop2:
2285         ldda    [%i1+0x10]%asi, %l4     ! %l4 has complete data and %l5 has
2286                                         ! partial
2287         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
2288                                                         ! into %l3 and %l4
2289         prefetch [%l0+0x40], #one_read
2290         stxa    %l3, [%i0+0x0]%asi
2291         stxa    %l4, [%i0+0x8]%asi
2292 
2293         ldda    [%i1+0x20]%asi, %l2
2294         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
2295         stxa    %l5, [%i0+0x10]%asi                     ! %l5 from previous read
2296         stxa    %l2, [%i0+0x18]%asi                     ! into %l5 and %l2
2297 
2298         ! Repeat the same for next 32 bytes.
2299 
2300         ldda    [%i1+0x30]%asi, %l4
2301         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2302         stxa    %l3, [%i0+0x20]%asi
2303         stxa    %l4, [%i0+0x28]%asi
2304 
2305         ldda    [%i1+0x40]%asi, %l2
2306         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2307         stxa    %l5, [%i0+0x30]%asi
2308         stxa    %l2, [%i0+0x38]%asi
2309 
2310         add     %l0, 0x40, %l0
2311         add     %i1, 0x40, %i1
2312         subcc   %i3, 0x40, %i3
2313         bgu,pt  %xcc, loop2
2314         add     %i0, 0x40, %i0
2315         ba      .blkdone
2316         add     %i1, %o2, %i1           ! increment the source by src offset
2317                                         ! the src offset was stored in %o2
2318 
2319 
2320         ! Both Source and Destination are block aligned.
2321         ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2322 .blkcpy:
2323         prefetch [%i1+0x0], #one_read
2324 1:
2325         ldda    [%i1+0x0]%asi, %l0
2326         ldda    [%i1+0x10]%asi, %l2
2327         prefetch [%i1+0x40], #one_read
2328 
2329         stxa    %l0, [%i0+0x0]%asi
2330         ldda    [%i1+0x20]%asi, %l4
2331         ldda    [%i1+0x30]%asi, %l6
2332 
2333         stxa    %l1, [%i0+0x8]%asi
2334         stxa    %l2, [%i0+0x10]%asi
2335         stxa    %l3, [%i0+0x18]%asi
2336         stxa    %l4, [%i0+0x20]%asi
2337         stxa    %l5, [%i0+0x28]%asi
2338         stxa    %l6, [%i0+0x30]%asi
2339         stxa    %l7, [%i0+0x38]%asi
2340 
2341         add     %i1, 0x40, %i1
2342         subcc   %i3, 0x40, %i3
2343         bgu,pt  %xcc, 1b
2344         add     %i0, 0x40, %i0
2345 
2346 .blkdone:
2347         membar  #Sync
2348 
2349         brz,pt  %i2, .blkexit
2350         nop
2351 
2352         ! Handle trailing bytes
2353         cmp     %i2, 0x8
2354         blu,pt  %ncc, .residue
2355         nop
2356 
2357         ! Can we do some 8B ops
2358         or      %i1, %i0, %o2
2359         andcc   %o2, 0x7, %g0
2360         bnz     %ncc, .last4
2361         nop
2362 
2363         ! Do 8byte ops as long as possible
2364 .last8:
2365         ldx     [%i1], %o2
2366         stx     %o2, [%i0]
2367         add     %i1, 0x8, %i1
2368         sub     %i2, 0x8, %i2
2369         cmp     %i2, 0x8
2370         bgu,pt  %ncc, .last8
2371         add     %i0, 0x8, %i0
2372 
2373         brz,pt  %i2, .blkexit
2374         nop
2375 
2376         ba      .residue
2377         nop
2378 
2379 .last4:
2380         ! Can we do 4B ops
2381         andcc   %o2, 0x3, %g0
2382         bnz     %ncc, .last2
2383         nop
2384 1:
2385         ld      [%i1], %o2
2386         st      %o2, [%i0]
2387         add     %i1, 0x4, %i1
2388         sub     %i2, 0x4, %i2
2389         cmp     %i2, 0x4
2390         bgu,pt  %ncc, 1b
2391         add     %i0, 0x4, %i0
2392 
2393         brz,pt  %i2, .blkexit
2394         nop
2395 
2396         ba      .residue
2397         nop
2398 
2399 .last2:
2400         ! Can we do 2B ops
2401         andcc   %o2, 0x1, %g0
2402         bnz     %ncc, .residue
2403         nop
2404 
2405 1:
2406         lduh    [%i1], %o2
2407         stuh    %o2, [%i0]
2408         add     %i1, 0x2, %i1
2409         sub     %i2, 0x2, %i2
2410         cmp     %i2, 0x2
2411         bgu,pt  %ncc, 1b
2412         add     %i0, 0x2, %i0
2413 
2414         brz,pt  %i2, .blkexit
2415         nop
2416 
2417 .residue:
2418         ldub    [%i1], %o2
2419         stb     %o2, [%i0]
2420         inc     %i1
2421         deccc   %i2
2422         bgu,pt  %ncc, .residue
2423         inc     %i0
2424 
2425 .blkexit:
2426 
2427         membar  #Sync                           ! sync error barrier
2428         ! Restore t_lofault handler, if came here from kcopy().
2429         tst     %o5
2430         bz      %ncc, 1f
2431         andn    %o5, LOFAULT_SET, %o5
2432         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2433 1:
2434         ret
2435         restore %g0, 0, %o0
2436 
2437 
2438 .bcb_punt:
2439         !
2440         ! use aligned transfers where possible
2441         !
2442         xor     %i0, %i1, %o4           ! xor from and to address
2443         btst    7, %o4                  ! if lower three bits zero
2444         bz      .aldoubcp               ! can align on double boundary
2445         .empty  ! silence assembler complaint about label in delay slot
2446 
2447         xor     %i0, %i1, %o4           ! xor from and to address
2448         btst    3, %o4                  ! if lower two bits zero
2449         bz      .alwordcp               ! can align on word boundary
2450         btst    3, %i0                  ! delay slot, from address unaligned?
2451         !
2452         ! use aligned reads and writes where possible
2453         ! this differs from wordcp in that it copes
2454         ! with odd alignment between source and destination
2455         ! using word reads and writes with the proper shifts
2456         ! in between to align transfers to and from memory
2457         ! i0 - src address, i1 - dest address, i2 - count
2458         ! i3, i4 - tmps used for generating complete word
2459         ! i5 (word to write)
2460         ! l0 size in bits of upper part of source word (US)
2461         ! l1 size in bits of lower part of source word (LS = 32 - US)
2462         ! l2 size in bits of upper part of destination word (UD)
2463         ! l3 size in bits of lower part of destination word (LD = 32 - UD)
2464         ! l4 number of bytes leftover after aligned transfers complete
2465         ! l5 the number 32
2466         !
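             ! Worked example (illustrative): if the source starts 3 bytes
             ! past a word boundary, one byte is read to align it, so
             ! US = 8 and LS = 24, and each output word is then built as
             !      (leftover << LS) | (next_src_word >> US)
             !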
2467         mov     32, %l5                 ! load an oft-needed constant
2468         bz      .align_dst_only
2469         btst    3, %i1                  ! is destination address aligned?
2470         clr     %i4                     ! clear registers used in either case
2471         bz      .align_src_only
2472         clr     %l0
2473         !
2474         ! both source and destination addresses are unaligned
2475         !
2476 1:                                      ! align source
2477         ldub    [%i0], %i3              ! read a byte from source address
2478         add     %i0, 1, %i0             ! increment source address
2479         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
2480         btst    3, %i0                  ! is source aligned?
2481         add     %l0, 8, %l0             ! increment size of upper source (US)
2482         bnz,a   1b
2483         sll     %i4, 8, %i4             ! make room for next byte
2484 
2485         sub     %l5, %l0, %l1           ! generate shift left count (LS)
2486         sll     %i4, %l1, %i4           ! prepare to get rest
2487         ld      [%i0], %i3              ! read a word
2488         add     %i0, 4, %i0             ! increment source address
2489         srl     %i3, %l0, %i5           ! upper src bits into lower dst bits
2490         or      %i4, %i5, %i5           ! merge
2491         mov     24, %l3                 ! align destination
2492 1:
2493         srl     %i5, %l3, %i4           ! prepare to write a single byte
2494         stb     %i4, [%i1]              ! write a byte
2495         add     %i1, 1, %i1             ! increment destination address
2496         sub     %i2, 1, %i2             ! decrement count
2497         btst    3, %i1                  ! is destination aligned?
2498         bnz,a   1b
2499         sub     %l3, 8, %l3             ! delay slot, decrement shift count (LD)
2500         sub     %l5, %l3, %l2           ! generate shift left count (UD)
2501         sll     %i5, %l2, %i5           ! move leftover into upper bytes
2502         cmp     %l2, %l0                ! cmp # reqd to fill dst w old src left
2503         bgu     %ncc, .more_needed      ! need more to fill than we have
2504         nop
2505 
2506         sll     %i3, %l1, %i3           ! clear upper used byte(s)
2507         srl     %i3, %l1, %i3
2508         ! get the odd bytes between alignments
2509         sub     %l0, %l2, %l0           ! regenerate shift count
2510         sub     %l5, %l0, %l1           ! generate new shift left count (LS)
2511         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
2512         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
2513         srl     %i3, %l0, %i4
2514         or      %i5, %i4, %i5
2515         st      %i5, [%i1]              ! write a word
2516         subcc   %i2, 4, %i2             ! decrement count
2517         bz      %ncc, .unalign_out
2518         add     %i1, 4, %i1             ! increment destination address
2519 
2520         b       2f
2521         sll     %i3, %l1, %i5           ! get leftover into upper bits
2522 .more_needed:
2523         sll     %i3, %l0, %i3           ! save remaining byte(s)
2524         srl     %i3, %l0, %i3
2525         sub     %l2, %l0, %l1           ! regenerate shift count
2526         sub     %l5, %l1, %l0           ! generate new shift left count
2527         sll     %i3, %l1, %i4           ! move to fill empty space
2528         b       3f
2529         or      %i5, %i4, %i5           ! merge to complete word
2530         !
2531         ! the source address is aligned and destination is not
2532         !
2533 .align_dst_only:
2534         ld      [%i0], %i4              ! read a word
2535         add     %i0, 4, %i0             ! increment source address
2536         mov     24, %l0                 ! initial shift alignment count
2537 1:
2538         srl     %i4, %l0, %i3           ! prepare to write a single byte
2539         stb     %i3, [%i1]              ! write a byte
2540         add     %i1, 1, %i1             ! increment destination address
2541         sub     %i2, 1, %i2             ! decrement count
2542         btst    3, %i1                  ! is destination aligned?
2543         bnz,a   1b
2544         sub     %l0, 8, %l0             ! delay slot, decrement shift count
2545 .xfer:
2546         sub     %l5, %l0, %l1           ! generate shift left count
2547         sll     %i4, %l1, %i5           ! get leftover
2548 3:
2549         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
2550         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
2551 2:
2552         ld      [%i0], %i3              ! read a source word
2553         add     %i0, 4, %i0             ! increment source address
2554         srl     %i3, %l0, %i4           ! upper src bits into lower dst bits
2555         or      %i5, %i4, %i5           ! merge with upper dest bits (leftover)
2556         st      %i5, [%i1]              ! write a destination word
2557         subcc   %i2, 4, %i2             ! decrement count
2558         bz      %ncc, .unalign_out      ! check if done
2559         add     %i1, 4, %i1             ! increment destination address
2560         b       2b                      ! loop
2561         sll     %i3, %l1, %i5           ! get leftover
2562 .unalign_out:
2563         tst     %l4                     ! any bytes leftover?
2564         bz      %ncc, .cpdone
2565         .empty                          ! allow next instruction in delay slot
2566 1:
2567         sub     %l0, 8, %l0             ! decrement shift
2568         srl     %i3, %l0, %i4           ! upper src byte into lower dst byte
2569         stb     %i4, [%i1]              ! write a byte
2570         subcc   %l4, 1, %l4             ! decrement count
2571         bz      %ncc, .cpdone           ! done?
2572         add     %i1, 1, %i1             ! increment destination
2573         tst     %l0                     ! any more previously read bytes
2574         bnz     %ncc, 1b                ! we have leftover bytes
2575         mov     %l4, %i2                ! delay slot, mv cnt where dbytecp wants
2576         b       .dbytecp                ! let dbytecp do the rest
2577         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
2578         !
2579         ! the destination address is aligned and the source is not
2580         !
2581 .align_src_only:
2582         ldub    [%i0], %i3              ! read a byte from source address
2583         add     %i0, 1, %i0             ! increment source address
2584         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
2585         btst    3, %i0                  ! is source aligned?
2586         add     %l0, 8, %l0             ! increment shift count (US)
2587         bnz,a   .align_src_only
2588         sll     %i4, 8, %i4             ! make room for next byte
2589         b,a     .xfer
2590         !
2591         ! if from address unaligned for double-word moves,
2592         ! move bytes till it is; if count is < 56 it could take
2593         ! longer to align the thing than to do the transfer
2594         ! in word size chunks right away
2595         !
2596 .aldoubcp:
2597         cmp     %i2, 56                 ! if count < 56, use wordcp, it takes
2598         blu,a   %ncc, .alwordcp         ! longer to align doubles than words
2599         mov     3, %o0                  ! mask for word alignment
2600         call    .alignit                ! copy bytes until aligned
2601         mov     7, %o0                  ! mask for double alignment
2602         !
2603         ! source and destination are now double-word aligned
2604         ! i3 has aligned count returned by alignit
2605         !
2606         and     %i2, 7, %i2             ! unaligned leftover count
2607         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
2608 5:
2609         ldx     [%i0+%i1], %o4          ! read from address
2610         stx     %o4, [%i1]              ! write at destination address
2611         subcc   %i3, 8, %i3             ! dec count
2612         bgu     %ncc, 5b
2613         add     %i1, 8, %i1             ! delay slot, inc to address
2614         cmp     %i2, 4                  ! see if we can copy a word
2615         blu     %ncc, .dbytecp          ! if 3 or fewer bytes use bytecp
2616         .empty
2617         !
2618         ! for leftover bytes we fall into wordcp, if needed
2619         !
2620 .wordcp:
2621         and     %i2, 3, %i2             ! unaligned leftover count
2622 5:
2623         ld      [%i0+%i1], %o4          ! read from address
2624         st      %o4, [%i1]              ! write at destination address
2625         subcc   %i3, 4, %i3             ! dec count
2626         bgu     %ncc, 5b
2627         add     %i1, 4, %i1             ! delay slot, inc to address
2628         b,a     .dbytecp
2629 
2630         ! we come here to align copies on word boundaries
2631 .alwordcp:
2632         call    .alignit                ! go word-align it
2633         mov     3, %o0                  ! bits that must be zero to be aligned
2634         b       .wordcp
2635         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
2636 
2637         !
2638         ! byte copy, works with any alignment
2639         !
2640 .bytecp:
2641         b       .dbytecp
2642         sub     %i0, %i1, %i0           ! i0 gets difference of src and dst
2643 
2644         !
2645         ! differenced byte copy, works with any alignment
2646         ! assumes dest in %i1 and (source - dest) in %i0
2647         !
2648 1:
2649         stb     %o4, [%i1]              ! write to address
2650         inc     %i1                     ! inc to address
2651 .dbytecp:
2652         deccc   %i2                     ! dec count
2653         bgeu,a  %ncc, 1b                ! loop till done
2654         ldub    [%i0+%i1], %o4          ! read from address
2655 .cpdone:
2656 
2657         membar  #Sync                           ! sync error barrier
2658         ! Restore t_lofault handler, if came here from kcopy().
2659         tst     %o5
2660         bz      %ncc, 1f
2661         andn    %o5, LOFAULT_SET, %o5
2662         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2663 1:
2664         ret
2665         restore %g0, 0, %o0             ! return (0)
2666 
2667 /*
2668  * Common code used to align transfers on word and doubleword
2669  * boundaries.  Aligns source and destination and returns a count
2670  * of aligned bytes to transfer in %i3
2671  */
2672 1:
2673         inc     %i0                     ! inc from
2674         stb     %o4, [%i1]              ! write a byte
2675         inc     %i1                     ! inc to
2676         dec     %i2                     ! dec count
2677 .alignit:
2678         btst    %o0, %i0                ! %o0 is bit mask to check for alignment
2679         bnz,a   1b
2680         ldub    [%i0], %o4              ! read next byte
2681 
2682         retl
2683         andn    %i2, %o0, %i3           ! return size of aligned bytes
2684         
2685         SET_SIZE(bcopy)
2686 
2687 #endif  /* NIAGARA_IMPL */
2688 
2689 #endif  /* lint */
2690 
2691 /*
2692  * Block copy with possibly overlapped operands.
2693  */
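
     /*
      * In outline (a sketch):
      *
      *      if (count == 0)
      *              return;
      *      if (abs(from - to) >= count)
      *              return bcopy(from, to, count);  ! no overlap
      *      if (from >= to)
      *              copy forward, a byte at a time;
      *      else
      *              copy backward from the end;
      */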
2694 
2695 #if defined(lint)
2696 
2697 /*ARGSUSED*/
2698 void
2699 ovbcopy(const void *from, void *to, size_t count)
2700 {}
2701 
2702 #else   /* lint */
2703 
2704         ENTRY(ovbcopy)
2705         tst     %o2                     ! check count
2706         bgu,a   %ncc, 1f                ! proceed if count > 0
2707         subcc   %o0, %o1, %o3           ! difference of from and to address
2708 
2709         retl                            ! return; nothing to do or bad arguments
2710         nop
2711 1:
2712         bneg,a  %ncc, 2f
2713         neg     %o3                     ! if < 0, make it positive
2714 2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
2715         bleu    %ncc, bcopy             ! if size <= abs(diff): use bcopy,
2716         .empty                          !   no overlap
2717         cmp     %o0, %o1                ! compare from and to addresses
2718         blu     %ncc, .ov_bkwd          ! if from < to, copy backwards
2719         nop
2720         !
2721         ! Copy forwards.
2722         !
2723 .ov_fwd:
2724         ldub    [%o0], %o3              ! read from address
2725         inc     %o0                     ! inc from address
2726         stb     %o3, [%o1]              ! write to address
2727         deccc   %o2                     ! dec count
2728         bgu     %ncc, .ov_fwd           ! loop till done
2729         inc     %o1                     ! inc to address
2730 
2731         retl                            ! return
2732         nop
2733         !
2734         ! Copy backwards.
2735         !
2736 .ov_bkwd:
2737         deccc   %o2                     ! dec count
2738         ldub    [%o0 + %o2], %o3        ! get byte at end of src
2739         bgu     %ncc, .ov_bkwd          ! loop till done
2740         stb     %o3, [%o1 + %o2]        ! delay slot, store at end of dst
2741 
2742         retl                            ! return
2743         nop
2744         SET_SIZE(ovbcopy)
2745 
2746 #endif  /* lint */
2747 
2748 /*
2749  * hwblkpagecopy()
2750  *
2751  * Copies exactly one page.  This routine assumes the caller (ppcopy)
2752  * has already disabled kernel preemption and has checked
2753  * use_hw_bcopy.
2754  */
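     /*
      * Sketch of the loop below: PAGESIZE bytes move in 0x80-byte
      * chunks - eight quad loads and sixteen 8-byte stores per chunk
      * through ASI_BLK_INIT_ST_QUAD_LDD_P - with the next two cache
      * lines prefetched ahead of the loads; a final membar #Sync
      * orders the block-initializing stores.
      */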
2755 #ifdef lint
2756 /*ARGSUSED*/
2757 void
2758 hwblkpagecopy(const void *src, void *dst)
2759 { }
2760 #else /* lint */
2761         ENTRY(hwblkpagecopy)
2762         save    %sp, -SA(MINFRAME), %sp
2763 
2764         ! %i0 - source address (arg)
2765         ! %i1 - destination address (arg)
2766         ! %i2 - length of region (not arg)
2767 
2768         set     PAGESIZE, %i2
2769 
2770         /*
2771          * Copying exactly one page; PAGESIZE is a multiple of 0x80.
2772          */
2773         mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2774         prefetch [%i0+0x0], #one_read
2775         prefetch [%i0+0x40], #one_read
2776 1:
2777         prefetch [%i0+0x80], #one_read
2778         prefetch [%i0+0xc0], #one_read
2779         ldda    [%i0+0x0]%asi, %l0
2780         ldda    [%i0+0x10]%asi, %l2
2781         ldda    [%i0+0x20]%asi, %l4
2782         ldda    [%i0+0x30]%asi, %l6
2783         stxa    %l0, [%i1+0x0]%asi
2784         stxa    %l1, [%i1+0x8]%asi
2785         stxa    %l2, [%i1+0x10]%asi
2786         stxa    %l3, [%i1+0x18]%asi
2787         stxa    %l4, [%i1+0x20]%asi
2788         stxa    %l5, [%i1+0x28]%asi
2789         stxa    %l6, [%i1+0x30]%asi
2790         stxa    %l7, [%i1+0x38]%asi
2791         ldda    [%i0+0x40]%asi, %l0
2792         ldda    [%i0+0x50]%asi, %l2
2793         ldda    [%i0+0x60]%asi, %l4
2794         ldda    [%i0+0x70]%asi, %l6
2795         stxa    %l0, [%i1+0x40]%asi
2796         stxa    %l1, [%i1+0x48]%asi
2797         stxa    %l2, [%i1+0x50]%asi
2798         stxa    %l3, [%i1+0x58]%asi
2799         stxa    %l4, [%i1+0x60]%asi
2800         stxa    %l5, [%i1+0x68]%asi
2801         stxa    %l6, [%i1+0x70]%asi
2802         stxa    %l7, [%i1+0x78]%asi
2803 
2804         add     %i0, 0x80, %i0
2805         subcc   %i2, 0x80, %i2
2806         bgu,pt  %xcc, 1b
2807         add     %i1, 0x80, %i1
2808 
2809         membar #Sync
2810         ret
2811         restore %g0, 0, %o0
2812         SET_SIZE(hwblkpagecopy)
2813 #endif  /* lint */
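
/*
 * Sketch of the copy loop above (illustrative only).  PAGESIZE is a
 * multiple of 0x80, so each iteration moves two 64-byte lines with no
 * remainder handling:
 *
 *	hwblkpagecopy(src, dst)
 *		len = PAGESIZE;
 *		do {
 *			prefetch the next two src lines;
 *			move 0x80 bytes via quad loads and block
 *			    initializing stores;	! ASI_BLK_INIT_ST_QUAD_LDD_P
 *			src += 0x80; dst += 0x80; len -= 0x80;
 *		} while (len > 0);
 *		membar #Sync;	! complete the block stores before return
 */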
2814 
2815 
2816 /*
2817  * Transfer data to and from user space -
2818  * Note that these routines can cause faults
2819  * It is assumed that the kernel has nothing at
2820  * less than KERNELBASE in the virtual address space.
2821  *
2822  * Note that copyin(9F) and copyout(9F) are part of the
2823  * DDI/DKI which specifies that they return '-1' on "errors."
2824  *
2825  * Sigh.
2826  *
2827  * So there's two extremely similar routines - xcopyin() and xcopyout()
2828  * which return the errno that we've faithfully computed.  This
2829  * allows other callers (e.g. uiomove(9F)) to work correctly.
2830  * Given that these are used pretty heavily, we expand the calling
2831  * sequences inline for all flavours (rather than making wrappers).
2832  *
2833  * There are also stub routines for xcopyout_little and xcopyin_little,
2834  * which currently are intended to handle requests of <= 16 bytes from
2835  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2836  * is left as an exercise...
2837  */
2838 
2839 /*
2840  * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2841  *
2842  * General theory of operation:
2843  *
2844  * None of the copyops routines grab a window until it's decided that
2845  * we need to do a HW block copy operation. This saves a window
2846  * spill/fill when we're called during socket ops. The typical IO
2847  * path won't cause spill/fill traps.
2848  *
2849  * This code uses a set of 4 limits for the maximum size that will
2850  * be copied given a particular input/output address alignment.
 * The default limits are:
2852  *
2853  * single byte aligned - 256 (hw_copy_limit_1)
2854  * two byte aligned - 512 (hw_copy_limit_2)
2855  * four byte aligned - 1024 (hw_copy_limit_4)
2856  * eight byte aligned - 1024 (hw_copy_limit_8)
2857  *
2858  * If the value for a particular limit is zero, the copy will be done
2859  * via the copy loops rather than block store/quad load instructions.
2860  *
2861  * Flow:
2862  *
2863  * If count == zero return zero.
2864  *
 * Store the previous lofault handler into %g6.
2866  * Place our secondary lofault handler into %g5.
2867  * Place the address of our nowindow fault handler into %o3.
2868  * Place the address of the windowed fault handler into %o4.
2869  * --> We'll use this handler if we end up grabbing a window
2870  * --> before we use block initializing store and quad load ASIs
2871  *
2872  * If count is less than or equal to SMALL_LIMIT (7) we
2873  * always do a byte for byte copy.
2874  *
 * If count is > SMALL_LIMIT, we check the alignment of the input
 * and output pointers and select the limit that corresponds to the
 * detected alignment.  If count exceeds that limit, we copy via
 * block initializing store and quad load instructions.
2880  *
2881  * If we don't exceed one of the limits, we store -count in %o3,
2882  * we store the number of chunks (8, 4, 2 or 1 byte) operated
 * on in our basic copy loop in %o2. Following this we branch
 * to the appropriate copy loop and copy that many chunks.
 * Since we've been adding the chunk size to %o3 each time through
 * as well as decrementing %o2, we can tell if any data is
 * left to be copied by examining %o3. If that is zero, we're
2888  * done and can go home. If not, we figure out what the largest
2889  * chunk size left to be copied is and branch to that copy loop
2890  * unless there's only one byte left. We load that as we're
2891  * branching to code that stores it just before we return.
2892  *
2893  * Fault handlers are invoked if we reference memory that has no
2894  * current mapping.  All forms share the same copyio_fault handler.
2895  * This routine handles fixing up the stack and general housecleaning.
2896  * Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation.  The handlers
 * for copyOP and xcopyOP are found at the end of each individual function.
2899  * The handlers for xcopyOP_little are found at the end of xcopyin_little.
2900  * The handlers for copyOP_noerr are found at the end of copyin_noerr.
2901  */
2902 
2903 /*
2904  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2905  */
2906 
2907 #if defined(lint)
2908 
2909 /*ARGSUSED*/
2910 int
2911 copyout(const void *kaddr, void *uaddr, size_t count)
2912 { return (0); }
2913 
2914 #else   /* lint */
2915 
2916 /*
2917  * We save the arguments in the following registers in case of a fault:
2918  *      kaddr - %g2
2919  *      uaddr - %g3
2920  *      count - %g4
2921  */
2922 #define SAVE_SRC        %g2
2923 #define SAVE_DST        %g3
2924 #define SAVE_COUNT      %g4
2925 
2926 #define REAL_LOFAULT            %g5
2927 #define SAVED_LOFAULT           %g6
2928 
2929 /*
2930  * Generic copyio fault handler.  This is the first line of defense when a 
2931  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
2932  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2933  * This allows us to share common code for all the flavors of the copy
2934  * operations, including the _noerr versions.
2935  *
2936  * Note that this function will restore the original input parameters before
2937  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
2938  * member of the t_copyop structure, if needed.
2939  */
2940         ENTRY(copyio_fault)
2941 #if !defined(NIAGARA_IMPL)
2942         btst    FPUSED_FLAG, SAVED_LOFAULT
2943         bz      1f
2944         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2945 
2946         wr      %l5, 0, %gsr            ! restore gsr
2947 
2948         btst    FPRS_FEF, %g1
2949         bz      %icc, 4f
2950         nop
2951 
2952         ! restore fpregs from stack
2953         BLD_FP_FROMSTACK(%o2)
2954 
2955         ba,pt   %ncc, 1f
2956         nop
2957 4:
2958         FZERO                           ! zero all of the fpregs
2959         wr      %g1, %g0, %fprs         ! restore fprs
2960 1:
2961         restore
2962         mov     SAVE_SRC, %o0
2963         mov     SAVE_DST, %o1
2964         jmp     REAL_LOFAULT
2965         mov     SAVE_COUNT, %o2
2966 
2967 #else   /* NIAGARA_IMPL */
2968         membar  #Sync
2969         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2970         restore
2971         mov     SAVE_SRC, %o0
2972         mov     SAVE_DST, %o1
2973         jmp     REAL_LOFAULT
2974         mov     SAVE_COUNT, %o2
2975 
2976 #endif  /* NIAGARA_IMPL */
2977 
2978         SET_SIZE(copyio_fault)
2979 
2980         ENTRY(copyio_fault_nowindow)
2981         membar  #Sync
2982         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2983 
2984         mov     SAVE_SRC, %o0
2985         mov     SAVE_DST, %o1
2986         jmp     REAL_LOFAULT
2987         mov     SAVE_COUNT, %o2
2988         SET_SIZE(copyio_fault_nowindow)
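
/*
 * Flow sketch of the fault paths above: both handlers put the original
 * arguments back and vector to the routine-specific handler:
 *
 *	copyio_fault:			! a window (and maybe FP) in use
 *		restore gsr/fpregs if FPUSED_FLAG was set;
 *		restore;		! back to caller's window
 *		(%o0, %o1, %o2) = (SAVE_SRC, SAVE_DST, SAVE_COUNT);
 *		jmp REAL_LOFAULT;
 *
 *	copyio_fault_nowindow:		! leaf case, no window grabbed
 *		t_lofault = SAVED_LOFAULT;
 *		(%o0, %o1, %o2) = (SAVE_SRC, SAVE_DST, SAVE_COUNT);
 *		jmp REAL_LOFAULT;
 */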
2989 
2990         ENTRY(copyout)
2991         sethi   %hi(.copyout_err), REAL_LOFAULT
2992         or      REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2993 
2994 #if !defined(NIAGARA_IMPL)
2995 .do_copyout:
2996         tst     %o2                     ! check for zero count;  quick exit
2997         bz,pt   %ncc, .co_smallqx
2998         mov     %o0, SAVE_SRC
2999         mov     %o1, SAVE_DST
3000         mov     %o2, SAVE_COUNT
3001         cmp     %o2, FP_COPY            ! check for small copy/leaf case
3002         bgt,pt  %ncc, .co_copy_more
3003         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3004 /*
3005  * Small copy out code
3006  * 
3007  */
3008         sethi   %hi(copyio_fault_nowindow), %o3
3009         or      %o3, %lo(copyio_fault_nowindow), %o3
3010         membar  #Sync
3011         stn     %o3, [THREAD_REG + T_LOFAULT]
3012 
3013         mov     ASI_USER, %asi
3014         cmp     %o2, SHORTCOPY          ! make sure there is enough to align
3015         ble,pt  %ncc, .co_smallest
3016         andcc   %o1, 0x7, %o3           ! is dest long word aligned
3017         bnz,pn  %ncc, .co_align
3018         andcc   %o1, 1, %o3             ! is dest byte aligned
3019 
3020 ! Destination is long word aligned
3021 ! 8 cases for src alignment; load parts, store long words
3022 .co_al_src:
3023         andcc   %o0, 7, %o3
3024         brnz,pt %o3, .co_src_dst_unal8
3025         nop
3026 /*
3027  * Special case for handling when src and dest are both long word aligned
3028  * and total data to move is less than FP_COPY bytes
3029  * Also handles finish up for large block moves, so may be less than 32 bytes
3030  */
3031 .co_medlong:
3032         subcc   %o2, 31, %o2            ! adjust length to allow cc test
3033         ble,pt  %ncc, .co_medl31
3034         nop
3035 .co_medl32:
3036         ldx     [%o0], %o4              ! move 32 bytes
3037         subcc   %o2, 32, %o2            ! decrement length count by 32
3038         stxa    %o4, [%o1]%asi
3039         ldx     [%o0+8], %o4
3040         stxa    %o4, [%o1+8]%asi
3041         ldx     [%o0+16], %o4
3042         add     %o0, 32, %o0            ! increase src ptr by 32
3043         stxa    %o4, [%o1+16]%asi
3044         ldx     [%o0-8], %o4
3045         add     %o1, 32, %o1            ! increase dst ptr by 32
3046         bgu,pt  %ncc, .co_medl32        ! repeat if at least 32 bytes left
3047         stxa    %o4, [%o1-8]%asi
3048 .co_medl31:
3049         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3050         ble,pt  %ncc, .co_medl7         ! skip if 7 or fewer bytes left
3051         nop
3052 .co_medl8:
3053         ldx     [%o0], %o4              ! move 8 bytes
3054         add     %o0, 8, %o0             ! increase src ptr by 8
3055         subcc   %o2, 8, %o2             ! decrease count by 8
3056         add     %o1, 8, %o1             ! increase dst ptr by 8
3057         bgu,pt  %ncc, .co_medl8
3058         stxa    %o4, [%o1-8]%asi
3059 .co_medl7:
3060         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
3061         bnz,pt  %ncc, .co_small4        ! do final bytes if not finished
3062 
3063 .co_smallx:                             ! finish up and exit
3064         membar  #Sync
3065         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3066 .co_smallqx:
3067         retl
3068         mov     %g0, %o0
3069 
3070 .co_small4:
3071         cmp     %o2, 4
3072         blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
3073         nop                             !
3074         ld      [%o0], %o4              ! move 4 bytes
3075         add     %o0, 4, %o0             ! increase src ptr by 4
3076         add     %o1, 4, %o1             ! increase dst ptr by 4
3077         subcc   %o2, 4, %o2             ! decrease count by 4
3078         bz,pt   %ncc, .co_smallx
3079         stwa    %o4, [%o1-4]%asi
3080 
3081 .co_small3x:                            ! Exactly 1, 2, or 3 bytes remain
3082         subcc   %o2, 1, %o2             ! reduce count for cc test
3083         ldub    [%o0], %o4              ! load one byte
3084         bz,pt   %ncc, .co_smallx
3085         stba    %o4, [%o1]%asi          ! store one byte
3086         ldub    [%o0+1], %o4            ! load second byte
3087         subcc   %o2, 1, %o2
3088         bz,pt   %ncc, .co_smallx
3089         stba    %o4, [%o1+1]%asi        ! store second byte
3090         ldub    [%o0+2], %o4            ! load third byte
3091         ba      .co_smallx
3092         stba    %o4, [%o1+2]%asi        ! store third byte
3093 
3094 .co_smallest:                           ! 7 or fewer bytes remain
3095         cmp     %o2, 4
3096         blt,pt  %ncc, .co_small3x
3097         nop
3098         ldub    [%o0], %o4              ! read byte
3099         subcc   %o2, 4, %o2             ! reduce count by 4
3100         stba    %o4, [%o1]%asi          ! write byte
3101         ldub    [%o0+1], %o4            ! repeat for total of 4 bytes
3102         add     %o0, 4, %o0             ! advance src by 4
3103         stba    %o4, [%o1+1]%asi
3104         ldub    [%o0-2], %o4
3105         add     %o1, 4, %o1             ! advance dst by 4
3106         stba    %o4, [%o1-2]%asi
3107         ldub    [%o0-1], %o4
3108         bnz,pt  %ncc, .co_small3x
3109         stba    %o4, [%o1-1]%asi
3110         membar  #Sync
3111         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3112         retl
3113         mov     %g0, %o0
3114 
3115 .co_align:                              ! byte align test in prior branch delay
3116         bnz,pt  %ncc, .co_al_d1
3117 .co_al_d1f:                             ! dest is now half word aligned
3118         andcc   %o1, 2, %o3
3119         bnz,pt  %ncc, .co_al_d2
3120 .co_al_d2f:                             ! dest is now word aligned
3121         andcc   %o1, 4, %o3             ! is dest longword aligned?
3122         bz,pt   %ncc, .co_al_src
3123         nop
3124 .co_al_d4:                              ! dest is word aligned;  src is unknown
3125         ldub    [%o0], %o4              ! move a word (src align unknown)
3126         ldub    [%o0+1], %o3
3127         sll     %o4, 24, %o4            ! position
3128         sll     %o3, 16, %o3            ! position
3129         or      %o4, %o3, %o3           ! merge
3130         ldub    [%o0+2], %o4
3131         sll     %o4, 8, %o4             ! position
3132         or      %o4, %o3, %o3           ! merge
3133         ldub    [%o0+3], %o4
3134         or      %o4, %o3, %o4           ! merge
3135         stwa    %o4,[%o1]%asi           ! store four bytes
3136         add     %o0, 4, %o0             ! adjust src by 4
3137         add     %o1, 4, %o1             ! adjust dest by 4
3138         sub     %o2, 4, %o2             ! adjust count by 4
3139         andcc   %o0, 7, %o3             ! check for src long word alignment
3140         brz,pt  %o3, .co_medlong
3141 .co_src_dst_unal8:
3142         ! dst is 8-byte aligned, src is not
3143         ! Size is less than FP_COPY
3144         ! Following code is to select for alignment
3145         andcc   %o0, 0x3, %o3           ! test word alignment
3146         bz,pt   %ncc, .co_medword
3147         nop
3148         andcc   %o0, 0x1, %o3           ! test halfword alignment
3149         bnz,pt  %ncc, .co_med_byte      ! go to byte move if not halfword
3150         andcc   %o0, 0x2, %o3           ! test which byte alignment
3151         ba      .co_medhalf
3152         nop
3153 .co_al_d1:                              ! align dest to half word
3154         ldub    [%o0], %o4              ! move a byte
3155         add     %o0, 1, %o0
3156         stba    %o4, [%o1]%asi
3157         add     %o1, 1, %o1
3158         andcc   %o1, 2, %o3
3159         bz,pt   %ncc, .co_al_d2f
3160         sub     %o2, 1, %o2
3161 .co_al_d2:                              ! align dest to word
3162         ldub    [%o0], %o4              ! move a half-word (src align unknown)
3163         ldub    [%o0+1], %o3
3164         sll     %o4, 8, %o4             ! position
3165         or      %o4, %o3, %o4           ! merge
3166         stha    %o4, [%o1]%asi
3167         add     %o0, 2, %o0
3168         add     %o1, 2, %o1
3169         andcc   %o1, 4, %o3             ! is dest longword aligned?
3170         bz,pt   %ncc, .co_al_src
3171         sub     %o2, 2, %o2
3172         ba      .co_al_d4
3173         nop
3174 /*
3175  * Handle all cases where src and dest are aligned on word
3176  * boundaries. Use unrolled loops for better performance.
3177  * This option wins over standard large data move when 
3178  * source and destination is in cache for medium
3179  * to short data moves.
3180  */
3181 .co_medword:
3182         subcc   %o2, 31, %o2            ! adjust length to allow cc test
3183         ble,pt  %ncc, .co_medw31
3184         nop
3185 .co_medw32:
3186         ld      [%o0], %o4              ! move a block of 32 bytes
3187         stwa    %o4, [%o1]%asi
3188         ld      [%o0+4], %o4
3189         stwa    %o4, [%o1+4]%asi
3190         ld      [%o0+8], %o4
3191         stwa    %o4, [%o1+8]%asi
3192         ld      [%o0+12], %o4
3193         stwa    %o4, [%o1+12]%asi
3194         ld      [%o0+16], %o4
3195         stwa    %o4, [%o1+16]%asi
3196         ld      [%o0+20], %o4
3197         subcc   %o2, 32, %o2            ! decrement length count
3198         stwa    %o4, [%o1+20]%asi
3199         ld      [%o0+24], %o4
3200         add     %o0, 32, %o0            ! increase src ptr by 32
3201         stwa    %o4, [%o1+24]%asi
3202         ld      [%o0-4], %o4
3203         add     %o1, 32, %o1            ! increase dst ptr by 32
3204         bgu,pt  %ncc, .co_medw32        ! repeat if at least 32 bytes left
3205         stwa    %o4, [%o1-4]%asi
3206 .co_medw31:
3207         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3208         ble,pt  %ncc, .co_medw7         ! skip if 7 or fewer bytes left
3209         nop                             !
3210 .co_medw15:
3211         ld      [%o0], %o4              ! move a block of 8 bytes
3212         subcc   %o2, 8, %o2             ! decrement length count
3213         stwa    %o4, [%o1]%asi
3214         add     %o0, 8, %o0             ! increase src ptr by 8
3215         ld      [%o0-4], %o4
3216         add     %o1, 8, %o1             ! increase dst ptr by 8
3217         bgu,pt  %ncc, .co_medw15
3218         stwa    %o4, [%o1-4]%asi
3219 .co_medw7:
3220         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
3221         bz,pt   %ncc, .co_smallx        ! exit if finished
3222         cmp     %o2, 4
3223         blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
3224         nop                             !
3225         ld      [%o0], %o4              ! move 4 bytes
3226         add     %o0, 4, %o0             ! increase src ptr by 4
3227         add     %o1, 4, %o1             ! increase dst ptr by 4
3228         subcc   %o2, 4, %o2             ! decrease count by 4
3229         bnz     .co_small3x
3230         stwa    %o4, [%o1-4]%asi
3231         membar  #Sync
3232         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3233         retl
3234         mov     %g0, %o0
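
/*
 * .co_medhalf below assembles each aligned 8-byte store from
 * halfword-aligned source data; on big-endian SPARC the first
 * source half lands in the top bits:
 *
 *	%o4 = (half @src << 48) | (word @(src+2) << 16) | half @(src+6);
 *	stxa %o4 -> dst;			! one 8-byte user store
 */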
3235 
3236 .co_medhalf:
3237         subcc   %o2, 31, %o2            ! adjust length to allow cc test
3238         ble,pt  %ncc, .co_medh31
3239         nop
3240 .co_medh32:                             ! load and store block of 32 bytes
3241 
3242         lduh    [%o0], %o4              ! move 32 bytes
3243         subcc   %o2, 32, %o2            ! decrement length count
3244         lduw    [%o0+2], %o3
3245         sllx    %o4, 48, %o4
3246         sllx    %o3, 16, %o3
3247         or      %o4, %o3, %o3
3248         lduh    [%o0+6], %o4
3249         or      %o4, %o3, %o4
3250         stxa    %o4, [%o1]%asi
3251 
3252         lduh    [%o0+8], %o4
3253         lduw    [%o0+10], %o3
3254         sllx    %o4, 48, %o4
3255         sllx    %o3, 16, %o3
3256         or      %o4, %o3, %o3
3257         lduh    [%o0+14], %o4
3258         or      %o4, %o3, %o4
3259         stxa    %o4, [%o1+8]%asi
3260 
3261         lduh    [%o0+16], %o4
3262         lduw    [%o0+18], %o3
3263         sllx    %o4, 48, %o4
3264         sllx    %o3, 16, %o3
3265         or      %o4, %o3, %o3
3266         lduh    [%o0+22], %o4
3267         or      %o4, %o3, %o4
3268         stxa    %o4, [%o1+16]%asi
3269 
3270         add     %o0, 32, %o0            ! increase src ptr by 32
3271         add     %o1, 32, %o1            ! increase dst ptr by 32
3272 
3273         lduh    [%o0-8], %o4
3274         lduw    [%o0-6], %o3
3275         sllx    %o4, 48, %o4
3276         sllx    %o3, 16, %o3
3277         or      %o4, %o3, %o3
3278         lduh    [%o0-2], %o4
3279         or      %o3, %o4, %o4
3280         bgu,pt  %ncc, .co_medh32        ! repeat if at least 32 bytes left
3281         stxa    %o4, [%o1-8]%asi
3282 
3283 .co_medh31:
3284         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3285         ble,pt  %ncc, .co_medh7         ! skip if 7 or fewer bytes left
3286         nop                             !
3287 .co_medh15:
        lduh    [%o0], %o4              ! move 8 bytes
3289         subcc   %o2, 8, %o2             ! decrement length count
3290         lduw    [%o0+2], %o3
3291         sllx    %o4, 48, %o4
3292         sllx    %o3, 16, %o3
3293         or      %o4, %o3, %o3
3294         add     %o1, 8, %o1             ! increase dst ptr by 8
3295         lduh    [%o0+6], %o4
3296         add     %o0, 8, %o0             ! increase src ptr by 8
3297         or      %o4, %o3, %o4
3298         bgu,pt  %ncc, .co_medh15
3299         stxa    %o4, [%o1-8]%asi
3300 .co_medh7:
3301         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
3302         bz,pt   %ncc, .co_smallx        ! exit if finished
3303         cmp     %o2, 4
3304         blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
3305         nop                             !
3306         lduh    [%o0], %o4
3307         sll     %o4, 16, %o4
3308         lduh    [%o0+2], %o3
3309         or      %o3, %o4, %o4
3310         subcc   %o2, 4, %o2
3311         add     %o0, 4, %o0
3312         add     %o1, 4, %o1
3313         bnz     .co_small3x
3314         stwa    %o4, [%o1-4]%asi
3315         membar  #Sync
3316         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3317         retl
3318         mov     %g0, %o0
3319 
3320         .align 16
3321 .co_med_byte:
3322         bnz,pt  %ncc, .co_medbh32a      ! go to correct byte move
3323         subcc   %o2, 31, %o2            ! adjust length to allow cc test
3324         ble,pt  %ncc, .co_medb31
3325         nop
3326 .co_medb32:                             ! Alignment 1 or 5
3327         subcc   %o2, 32, %o2            ! decrement length count
3328 
3329         ldub    [%o0], %o4              ! load and store a block of 32 bytes
3330         sllx    %o4, 56, %o3
3331         lduh    [%o0+1], %o4
3332         sllx    %o4, 40, %o4
3333         or      %o4, %o3, %o3
3334         lduw    [%o0+3], %o4
3335         sllx    %o4, 8, %o4
3336         or      %o4, %o3, %o3
3337         ldub    [%o0+7], %o4
3338         or      %o4, %o3, %o4
3339         stxa    %o4, [%o1]%asi
3340 
3341         ldub    [%o0+8], %o4
3342         sllx    %o4, 56, %o3
3343         lduh    [%o0+9], %o4
3344         sllx    %o4, 40, %o4
3345         or      %o4, %o3, %o3
3346         lduw    [%o0+11], %o4
3347         sllx    %o4, 8, %o4
3348         or      %o4, %o3, %o3
3349         ldub    [%o0+15], %o4
3350         or      %o4, %o3, %o4
3351         stxa    %o4, [%o1+8]%asi
3352 
3353         ldub    [%o0+16], %o4
3354         sllx    %o4, 56, %o3
3355         lduh    [%o0+17], %o4
3356         sllx    %o4, 40, %o4
3357         or      %o4, %o3, %o3
3358         lduw    [%o0+19], %o4
3359         sllx    %o4, 8, %o4
3360         or      %o4, %o3, %o3
3361         ldub    [%o0+23], %o4
3362         or      %o4, %o3, %o4
3363         stxa    %o4, [%o1+16]%asi
3364 
3365         add     %o0, 32, %o0            ! increase src ptr by 32
3366         add     %o1, 32, %o1            ! increase dst ptr by 32
3367 
3368         ldub    [%o0-8], %o4
3369         sllx    %o4, 56, %o3
3370         lduh    [%o0-7], %o4
3371         sllx    %o4, 40, %o4
3372         or      %o4, %o3, %o3
3373         lduw    [%o0-5], %o4
3374         sllx    %o4, 8, %o4
3375         or      %o4, %o3, %o3
3376         ldub    [%o0-1], %o4
3377         or      %o4, %o3, %o4
3378         bgu,pt  %ncc, .co_medb32        ! repeat if at least 32 bytes left
3379         stxa    %o4, [%o1-8]%asi
3380 
3381 .co_medb31:                             ! 31 or fewer bytes remaining
3382         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3383         ble,pt  %ncc, .co_medb7         ! skip if 7 or fewer bytes left
3384         nop                             !
3385 .co_medb15:
3386 
3387         ldub    [%o0], %o4              ! load and store a block of 8 bytes
3388         subcc   %o2, 8, %o2             ! decrement length count
3389         sllx    %o4, 56, %o3
3390         lduh    [%o0+1], %o4
3391         sllx    %o4, 40, %o4
3392         or      %o4, %o3, %o3
3393         lduw    [%o0+3], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
3395         sllx    %o4, 8, %o4
3396         or      %o4, %o3, %o3
3397         ldub    [%o0+7], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
3399         or      %o4, %o3, %o4
3400         bgu,pt  %ncc, .co_medb15
3401         stxa    %o4, [%o1-8]%asi
3402 .co_medb7:
3403         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
3404         bz,pt   %ncc, .co_smallx        ! exit if finished
3405         cmp     %o2, 4
3406         blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
3407         nop                             !
3408         ldub    [%o0], %o4              ! move 4 bytes
3409         sll     %o4, 24, %o3
3410         lduh    [%o0+1], %o4
3411         sll     %o4, 8, %o4
3412         or      %o4, %o3, %o3
3413         ldub    [%o0+3], %o4
3414         or      %o4, %o3, %o4
3415         subcc   %o2, 4, %o2
3416         add     %o0, 4, %o0
3417         add     %o1, 4, %o1
3418         bnz     .co_small3x
3419         stwa    %o4, [%o1-4]%asi
3420         membar  #Sync
3421         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3422         retl
3423         mov     %g0, %o0
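
/*
 * The two byte-merge patterns in this section (.co_medb32 above,
 * .co_medbh32 below) are selected by the src & 2 test done earlier:
 *
 *	alignment 1 or 5 (.co_medb32):	byte, half, word, byte
 *	alignment 3 or 7 (.co_medbh32):	byte, word, half, byte
 *
 * each merged into one aligned 8-byte user store, as above.
 */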
3424 
3425         .align 16
3426 .co_medbh32a:
3427         ble,pt  %ncc, .co_medbh31
3428         nop
3429 .co_medbh32:                            ! Alignment 3 or 7
3430         subcc   %o2, 32, %o2            ! decrement length count
3431 
3432         ldub    [%o0], %o4              ! load and store a block of 32 bytes
3433         sllx    %o4, 56, %o3
3434         lduw    [%o0+1], %o4
3435         sllx    %o4, 24, %o4
3436         or      %o4, %o3, %o3
3437         lduh    [%o0+5], %o4
3438         sllx    %o4, 8, %o4
3439         or      %o4, %o3, %o3
3440         ldub    [%o0+7], %o4
3441         or      %o4, %o3, %o4
3442         stxa    %o4, [%o1]%asi
3443 
3444         ldub    [%o0+8], %o4
3445         sllx    %o4, 56, %o3
3446         lduw    [%o0+9], %o4
3447         sllx    %o4, 24, %o4
3448         or      %o4, %o3, %o3
3449         lduh    [%o0+13], %o4
3450         sllx    %o4, 8, %o4
3451         or      %o4, %o3, %o3
3452         ldub    [%o0+15], %o4
3453         or      %o4, %o3, %o4
3454         stxa    %o4, [%o1+8]%asi
3455 
3456         ldub    [%o0+16], %o4
3457         sllx    %o4, 56, %o3
3458         lduw    [%o0+17], %o4
3459         sllx    %o4, 24, %o4
3460         or      %o4, %o3, %o3
3461         lduh    [%o0+21], %o4
3462         sllx    %o4, 8, %o4
3463         or      %o4, %o3, %o3
3464         ldub    [%o0+23], %o4
3465         or      %o4, %o3, %o4
3466         stxa    %o4, [%o1+16]%asi
3467 
3468         add     %o0, 32, %o0            ! increase src ptr by 32
3469         add     %o1, 32, %o1            ! increase dst ptr by 32
3470 
3471         ldub    [%o0-8], %o4
3472         sllx    %o4, 56, %o3
3473         lduw    [%o0-7], %o4
3474         sllx    %o4, 24, %o4
3475         or      %o4, %o3, %o3
3476         lduh    [%o0-3], %o4
3477         sllx    %o4, 8, %o4
3478         or      %o4, %o3, %o3
3479         ldub    [%o0-1], %o4
3480         or      %o4, %o3, %o4
3481         bgu,pt  %ncc, .co_medbh32       ! repeat if at least 32 bytes left
3482         stxa    %o4, [%o1-8]%asi
3483 
3484 .co_medbh31:
3485         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3486         ble,pt  %ncc, .co_medb7         ! skip if 7 or fewer bytes left
3487         nop                             !
3488 .co_medbh15:
3489         ldub    [%o0], %o4              ! load and store a block of 8 bytes
3490         sllx    %o4, 56, %o3
3491         lduw    [%o0+1], %o4
3492         sllx    %o4, 24, %o4
3493         or      %o4, %o3, %o3
3494         lduh    [%o0+5], %o4
3495         sllx    %o4, 8, %o4
3496         or      %o4, %o3, %o3
3497         ldub    [%o0+7], %o4
3498         or      %o4, %o3, %o4
3499         stxa    %o4, [%o1]%asi
3500         subcc   %o2, 8, %o2             ! decrement length count
3501         add     %o1, 8, %o1             ! increase dst ptr by 8
3502         add     %o0, 8, %o0             ! increase src ptr by 8
3503         bgu,pt  %ncc, .co_medbh15
3504         stxa    %o4, [%o1-8]%asi
3505         ba      .co_medb7
3506         nop
3507 /*
3508  * End of small copy (no window) code
3509  */
3510 
3511 /*
3512  * Long copy code
3513  */
3514 .co_copy_more:
3515         sethi   %hi(copyio_fault), %o3
3516         or      %o3, %lo(copyio_fault), %o3
3517         membar  #Sync
3518         stn     %o3, [THREAD_REG + T_LOFAULT]
3519 
3520 /*
3521  * Following code is for large copies. We know there is at
3522  * least FP_COPY bytes available. FP regs are used, so
3523  *  we save registers and fp regs before starting
3524  */
3525         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3526         or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3527         rd      %fprs, %g1              ! check for unused fp
3528         ! if fprs.fef == 0, set it.
3529         ! Setting it when already set costs more than checking
3530         andcc   %g1, FPRS_FEF, %g1      ! test FEF, fprs.du = fprs.dl = 0
3531         bz,pt   %ncc, .co_fp_unused
3532         mov     ASI_USER, %asi
3533         BST_FP_TOSTACK(%o3)
3534         ba      .co_fp_ready
3535 .co_fp_unused:
3536         prefetch [%i0 + (1 * CACHE_LINE)], #one_read
3537         wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
3538 .co_fp_ready:
3539         rd      %gsr, %l5               ! save %gsr value
3540         andcc   %i1, 1, %o3             ! is dest byte aligned
3541         bnz,pt  %ncc, .co_big_d1
3542 .co_big_d1f:                            ! dest is now half word aligned
3543         andcc   %i1, 2, %o3
3544         bnz,pt  %ncc, .co_big_d2
3545 .co_big_d2f:                            ! dest is now word aligned
3546         andcc   %i1, 4, %o3             ! is dest longword aligned
3547         bnz,pt  %ncc, .co_big_d4
3548 .co_big_d4f:                            ! dest is now long word aligned
3549         andcc   %i0, 7, %o3             ! is src long word aligned
3550         brnz,pt %o3, .co_big_unal8
3551         prefetch [%i0 + (2 * CACHE_LINE)], #one_read
3552         ! Src and dst are long word aligned
3553         ! align dst to 64 byte boundary
3554         andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
3555         brz,pn  %o3, .co_al_to_64
3556         nop
3557         sub     %o3, 64, %o3            ! %o3 has negative bytes to move
3558         add     %i2, %o3, %i2           ! adjust remaining count
3559         andcc   %o3, 8, %o4             ! odd long words to move?
3560         brz,pt  %o4, .co_al_to_16
3561         nop
3562         add     %o3, 8, %o3
3563         ldx     [%i0], %o4
3564         add     %i0, 8, %i0             ! increment src ptr
3565         stxa    %o4, [%i1]ASI_USER
3566         add     %i1, 8, %i1             ! increment dst ptr
3567 ! Dest is aligned on 16 bytes, src 8 byte aligned
3568 .co_al_to_16:
        andcc   %o3, 0x30, %o4          ! 16-byte chunks to move?
3570         brz,pt  %o4, .co_al_to_64
3571         nop
3572 .co_al_mv_16:
3573         add     %o3, 16, %o3
3574         ldx     [%i0], %o4
3575         stxa    %o4, [%i1]ASI_USER
3576         add     %i0, 16, %i0            ! increment src ptr
3577         ldx     [%i0-8], %o4
3578         add     %i1, 8, %i1             ! increment dst ptr
3579         stxa    %o4, [%i1]ASI_USER
3580         andcc   %o3, 0x30, %o4
3581         brnz,pt %o4, .co_al_mv_16
3582         add     %i1, 8, %i1             ! increment dst ptr
3583 ! Dest is aligned on 64 bytes, src 8 byte aligned
3584 .co_al_to_64:
3585         ! Determine source alignment
3586         ! to correct 8 byte offset
3587         andcc   %i0, 32, %o3
3588         brnz,pn %o3, .co_aln_1
3589         andcc   %i0, 16, %o3
3590         brnz,pn %o3, .co_aln_01
3591         andcc   %i0, 8, %o3
3592         brz,pn  %o3, .co_aln_000
3593         prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3594         ba      .co_aln_001
3595         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3596 .co_aln_01:
3597         brnz,pn %o3, .co_aln_011
3598         prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3599         ba      .co_aln_010
3600         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3601 .co_aln_1:
3602         andcc   %i0, 16, %o3
3603         brnz,pn %o3, .co_aln_11
3604         andcc   %i0, 8, %o3
3605         brnz,pn %o3, .co_aln_101
3606         prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3607         ba      .co_aln_100
3608         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3609 .co_aln_11:
3610         brz,pn  %o3, .co_aln_110
3611         prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3612 
3613 .co_aln_111:
3614 ! Alignment off by 8 bytes
3615         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3616         ldd     [%i0], %d0
3617         add     %i0, 8, %i0
3618         sub     %i2, 8, %i2
3619         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3620         and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1           ! %i1 = dst - src; [%i0+%i1] is dst
3622 .co_aln_111_loop:
3623         ldda    [%i0]ASI_BLK_P,%d16             ! block load
3624         subcc   %o3, 64, %o3
3625         fmovd   %d16, %d2
3626         fmovd   %d18, %d4
3627         fmovd   %d20, %d6
3628         fmovd   %d22, %d8
3629         fmovd   %d24, %d10
3630         fmovd   %d26, %d12
3631         fmovd   %d28, %d14
3632         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3633         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3634         add     %i0, 64, %i0
3635         fmovd   %d30, %d0
3636         bgt,pt  %ncc, .co_aln_111_loop
3637         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3638         add     %i1, %i0, %i1
3639 
3640         stda    %d0, [%i1]ASI_USER
3641         ba      .co_remain_stuff
3642         add     %i1, 8, %i1
3643         ! END OF aln_111
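
/*
 * Software-pipeline sketch shared by the .co_aln_* loops (aln_111
 * above is the 8-byte-offset case):
 *
 *	The low fp registers carry the source tail left over from the
 *	previous iteration.  Each pass block-loads the next 64 bytes
 *	into %d16..%d30, rotates the head of that data into the carry
 *	registers with fmovd, and writes one fully assembled 64-byte
 *	line with a block initializing store:
 *
 *	loop:
 *		%d16..%d30 = 64 bytes at src;		! ldda ASI_BLK_P
 *		store carry ++ head to dst;		! stda ASI_BLK_AIUS
 *		carry = tail of %d16..%d30;		! fmovd chain
 *		src += 64;
 *	while (more blocks);
 *	store the remaining carry doublewords;		! stda %asi
 */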
3644 
3645 .co_aln_110:
3646 ! Alignment off by 16 bytes
3647         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3648         ldd     [%i0], %d0
3649         ldd     [%i0+8], %d2
3650         add     %i0, 16, %i0
3651         sub     %i2, 16, %i2
3652         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3653         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3654         sub     %i1, %i0, %i1
3655 .co_aln_110_loop:
3656         ldda    [%i0]ASI_BLK_P,%d16             ! block load
3657         subcc   %o3, 64, %o3
3658         fmovd   %d16, %d4
3659         fmovd   %d18, %d6
3660         fmovd   %d20, %d8
3661         fmovd   %d22, %d10
3662         fmovd   %d24, %d12
3663         fmovd   %d26, %d14
3664         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3665         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3666         add     %i0, 64, %i0
3667         fmovd   %d28, %d0
3668         fmovd   %d30, %d2
3669         bgt,pt  %ncc, .co_aln_110_loop
3670         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3671         add     %i1, %i0, %i1
3672 
3673         stda    %d0, [%i1]%asi
3674         stda    %d2, [%i1+8]%asi
3675         ba      .co_remain_stuff
3676         add     %i1, 16, %i1
3677         ! END OF aln_110
3678 
3679 .co_aln_101:
3680 ! Alignment off by 24 bytes
3681         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3682         ldd     [%i0], %d0
3683         ldd     [%i0+8], %d2
3684         ldd     [%i0+16], %d4
3685         add     %i0, 24, %i0
3686         sub     %i2, 24, %i2
3687         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3688         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3689         sub     %i1, %i0, %i1
3690 .co_aln_101_loop:
3691         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3692         subcc   %o3, 64, %o3
3693         fmovd   %d16, %d6
3694         fmovd   %d18, %d8
3695         fmovd   %d20, %d10
3696         fmovd   %d22, %d12
3697         fmovd   %d24, %d14
3698         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3699         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3700         add     %i0, 64, %i0
3701         fmovd   %d26, %d0
3702         fmovd   %d28, %d2
3703         fmovd   %d30, %d4
3704         bgt,pt  %ncc, .co_aln_101_loop
3705         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3706         add     %i1, %i0, %i1
3707 
3708         stda    %d0, [%i1]%asi
3709         stda    %d2, [%i1+8]%asi
3710         stda    %d4, [%i1+16]%asi
3711         ba      .co_remain_stuff
3712         add     %i1, 24, %i1
3713         ! END OF aln_101
3714 
3715 .co_aln_100:
3716 ! Alignment off by 32 bytes
3717         ldd     [%i0], %d0
3718         ldd     [%i0+8], %d2
3719         ldd     [%i0+16],%d4
3720         ldd     [%i0+24],%d6
3721         add     %i0, 32, %i0
3722         sub     %i2, 32, %i2
3723         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3724         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3725         sub     %i1, %i0, %i1
3726 .co_aln_100_loop:
3727         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3728         subcc   %o3, 64, %o3
3729         fmovd   %d16, %d8
3730         fmovd   %d18, %d10
3731         fmovd   %d20, %d12
3732         fmovd   %d22, %d14
3733         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3734         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3735         add     %i0, 64, %i0
3736         fmovd   %d24, %d0
3737         fmovd   %d26, %d2
3738         fmovd   %d28, %d4
3739         fmovd   %d30, %d6
3740         bgt,pt  %ncc, .co_aln_100_loop
3741         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3742         add     %i1, %i0, %i1
3743 
3744         stda    %d0, [%i1]%asi
3745         stda    %d2, [%i1+8]%asi
3746         stda    %d4, [%i1+16]%asi
3747         stda    %d6, [%i1+24]%asi
3748         ba      .co_remain_stuff
3749         add     %i1, 32, %i1
3750         ! END OF aln_100
3751 
3752 .co_aln_011:
3753 ! Alignment off by 40 bytes
3754         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3755         ldd     [%i0], %d0
3756         ldd     [%i0+8], %d2
3757         ldd     [%i0+16], %d4
3758         ldd     [%i0+24], %d6
3759         ldd     [%i0+32], %d8
3760         add     %i0, 40, %i0
3761         sub     %i2, 40, %i2
3762         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3763         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3764         sub     %i1, %i0, %i1
3765 .co_aln_011_loop:
3766         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3767         subcc   %o3, 64, %o3
3768         fmovd   %d16, %d10
3769         fmovd   %d18, %d12
3770         fmovd   %d20, %d14
3771         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3772         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3773         add     %i0, 64, %i0
3774         fmovd   %d22, %d0
3775         fmovd   %d24, %d2
3776         fmovd   %d26, %d4
3777         fmovd   %d28, %d6
3778         fmovd   %d30, %d8
3779         bgt,pt  %ncc, .co_aln_011_loop
3780         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3781         add     %i1, %i0, %i1
3782 
3783         stda    %d0, [%i1]%asi
3784         stda    %d2, [%i1+8]%asi
3785         stda    %d4, [%i1+16]%asi
3786         stda    %d6, [%i1+24]%asi
3787         stda    %d8, [%i1+32]%asi
3788         ba      .co_remain_stuff
3789         add     %i1, 40, %i1
3790         ! END OF aln_011
3791 
3792 .co_aln_010:
3793 ! Alignment off by 48 bytes
3794         ldd     [%i0], %d0
3795         ldd     [%i0+8], %d2
3796         ldd     [%i0+16], %d4
3797         ldd     [%i0+24], %d6
3798         ldd     [%i0+32], %d8
3799         ldd     [%i0+40], %d10
3800         add     %i0, 48, %i0
3801         sub     %i2, 48, %i2
3802         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3803         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3804         sub     %i1, %i0, %i1
3805 .co_aln_010_loop:
3806         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3807         subcc   %o3, 64, %o3
3808         fmovd   %d16, %d12
3809         fmovd   %d18, %d14
3810         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3811         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3812         add     %i0, 64, %i0
3813         fmovd   %d20, %d0
3814         fmovd   %d22, %d2
3815         fmovd   %d24, %d4
3816         fmovd   %d26, %d6
3817         fmovd   %d28, %d8
3818         fmovd   %d30, %d10
3819         bgt,pt  %ncc, .co_aln_010_loop
3820         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3821         add     %i1, %i0, %i1
3822 
3823         stda    %d0, [%i1]%asi
3824         stda    %d2, [%i1+8]%asi
3825         stda    %d4, [%i1+16]%asi
3826         stda    %d6, [%i1+24]%asi
3827         stda    %d8, [%i1+32]%asi
3828         stda    %d10, [%i1+40]%asi
3829         ba      .co_remain_stuff
3830         add     %i1, 48, %i1
3831         ! END OF aln_010
3832 
3833 .co_aln_001:
3834 ! Alignment off by 56 bytes
3835         ldd     [%i0], %d0
3836         ldd     [%i0+8], %d2
3837         ldd     [%i0+16], %d4
3838         ldd     [%i0+24], %d6
3839         ldd     [%i0+32], %d8
3840         ldd     [%i0+40], %d10
3841         ldd     [%i0+48], %d12
3842         add     %i0, 56, %i0
3843         sub     %i2, 56, %i2
3844         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3845         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3846         sub     %i1, %i0, %i1
3847 .co_aln_001_loop:
3848         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3849         subcc   %o3, 64, %o3
3850         fmovd   %d16, %d14
3851         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3852         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3853         add     %i0, 64, %i0
3854         fmovd   %d18, %d0
3855         fmovd   %d20, %d2
3856         fmovd   %d22, %d4
3857         fmovd   %d24, %d6
3858         fmovd   %d26, %d8
3859         fmovd   %d28, %d10
3860         fmovd   %d30, %d12
3861         bgt,pt  %ncc, .co_aln_001_loop
3862         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3863         add     %i1, %i0, %i1
3864 
3865         stda    %d0, [%i1]%asi
3866         stda    %d2, [%i1+8]%asi
3867         stda    %d4, [%i1+16]%asi
3868         stda    %d6, [%i1+24]%asi
3869         stda    %d8, [%i1+32]%asi
3870         stda    %d10, [%i1+40]%asi
3871         stda    %d12, [%i1+48]%asi
3872         ba      .co_remain_stuff
3873         add     %i1, 56, %i1
3874         ! END OF aln_001
3875 
3876 .co_aln_000:
3877         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3878         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3879         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3880         sub     %i1, %i0, %i1
3881 .co_aln_000_loop:
3882         ldda    [%i0]ASI_BLK_P,%d0
3883         subcc   %o3, 64, %o3
3884         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3885         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3886         add     %i0, 64, %i0
3887         bgt,pt  %ncc, .co_aln_000_loop
3888         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3889         add     %i1, %i0, %i1
3890 
3891         ! END OF aln_000
3892 
3893 .co_remain_stuff:
3894         subcc   %i2, 31, %i2            ! adjust length to allow cc test
3895         ble,pt  %ncc, .co_aln_31
3896         nop
3897 .co_aln_32:
3898         ldx     [%i0], %o4              ! move 32 bytes
3899         subcc   %i2, 32, %i2            ! decrement length count by 32
3900         stxa    %o4, [%i1]%asi
3901         ldx     [%i0+8], %o4
3902         stxa    %o4, [%i1+8]%asi
3903         ldx     [%i0+16], %o4
3904         add     %i0, 32, %i0            ! increase src ptr by 32
3905         stxa    %o4, [%i1+16]%asi
3906         ldx     [%i0-8], %o4
3907         add     %i1, 32, %i1            ! increase dst ptr by 32
3908         bgu,pt  %ncc, .co_aln_32        ! repeat if at least 32 bytes left
3909         stxa    %o4, [%i1-8]%asi
3910 .co_aln_31:
3911         addcc   %i2, 24, %i2            ! adjust count to be off by 7
3912         ble,pt  %ncc, .co_aln_7         ! skip if 7 or fewer bytes left
3913         nop                             !
3914 .co_aln_15:
3915         ldx     [%i0], %o4              ! move 8 bytes
3916         add     %i0, 8, %i0             ! increase src ptr by 8
3917         subcc   %i2, 8, %i2             ! decrease count by 8
3918         add     %i1, 8, %i1             ! increase dst ptr by 8
3919         bgu,pt  %ncc, .co_aln_15
3920         stxa    %o4, [%i1-8]%asi
3921 .co_aln_7:
3922         addcc   %i2, 7, %i2             ! finish adjustment of remaining count
3923         bz,pt   %ncc, .co_exit          ! exit if finished
3924         cmp     %i2, 4
3925         blt,pt  %ncc, .co_unaln3x       ! skip if less than 4 bytes left
3926         nop                             !
3927         ld      [%i0], %o4              ! move 4 bytes
3928         add     %i0, 4, %i0             ! increase src ptr by 4
3929         add     %i1, 4, %i1             ! increase dst ptr by 4
3930         subcc   %i2, 4, %i2             ! decrease count by 4
3931         bnz     .co_unaln3x
3932         stwa    %o4, [%i1-4]%asi
3933         ba      .co_exit
3934         nop
3935 
3936         ! destination alignment code
3937 .co_big_d1:
3938         ldub    [%i0], %o4              ! move a byte
3939         add     %i0, 1, %i0
3940         stba    %o4, [%i1]ASI_USER
3941         add     %i1, 1, %i1
3942         andcc   %i1, 2, %o3
3943         bz,pt   %ncc, .co_big_d2f
3944         sub     %i2, 1, %i2
3945 .co_big_d2:
3946         ldub    [%i0], %o4              ! move a half-word (src align unknown)
3947         ldub    [%i0+1], %o3
3948         add     %i0, 2, %i0
3949         sll     %o4, 8, %o4             ! position
3950         or      %o4, %o3, %o4           ! merge
3951         stha    %o4, [%i1]ASI_USER
3952         add     %i1, 2, %i1
3953         andcc   %i1, 4, %o3             ! is dest longword aligned
3954         bz,pt   %ncc, .co_big_d4f
3955         sub     %i2, 2, %i2
3956 .co_big_d4:                             ! dest is at least word aligned
3957         nop
3958         ldub    [%i0], %o4              ! move a word (src align unknown)
3959         ldub    [%i0+1], %o3
3960         sll     %o4, 24, %o4            ! position
3961         sll     %o3, 16, %o3            ! position
3962         or      %o4, %o3, %o3           ! merge
3963         ldub    [%i0+2], %o4
3964         sll     %o4, 8, %o4             ! position
3965         or      %o4, %o3, %o3           ! merge
3966         ldub    [%i0+3], %o4
3967         or      %o4, %o3, %o4           ! merge
3968         stwa    %o4,[%i1]ASI_USER       ! store four bytes
3969         add     %i0, 4, %i0             ! adjust src by 4
3970         add     %i1, 4, %i1             ! adjust dest by 4
3971         ba      .co_big_d4f
3972         sub     %i2, 4, %i2             ! adjust count by 4
3973 
3974 
3975         ! Dst is on 8 byte boundary; src is not;
3976 .co_big_unal8:
3977         andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
3978         bz      %ncc, .co_unalnsrc
3979         sub     %o3, 64, %o3            ! %o3 will be multiple of 8
3980         neg     %o3                     ! bytes until dest is 64 byte aligned
3981         sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
3982         ! Move bytes according to source alignment
3983         andcc   %i0, 0x1, %o4
3984         bnz     %ncc, .co_unalnbyte     ! check for byte alignment
3985         nop
3986         andcc   %i0, 2, %o4             ! check for half word alignment
3987         bnz     %ncc, .co_unalnhalf
3988         nop
3989         ! Src is word aligned, move bytes until dest 64 byte aligned
3990 .co_unalnword:
3991         ld      [%i0], %o4              ! load 4 bytes
3992         stwa    %o4, [%i1]%asi          ! and store 4 bytes
3993         ld      [%i0+4], %o4            ! load 4 bytes
3994         add     %i0, 8, %i0             ! increase src ptr by 8
3995         stwa    %o4, [%i1+4]%asi        ! and store 4 bytes
3996         subcc   %o3, 8, %o3             ! decrease count by 8
3997         bnz     %ncc, .co_unalnword
3998         add     %i1, 8, %i1             ! increase dst ptr by 8
3999         ba      .co_unalnsrc
4000         nop
4001 
4002         ! Src is half-word aligned, move bytes until dest 64 byte aligned
4003 .co_unalnhalf:
4004         lduh    [%i0], %o4              ! load 2 bytes
4005         sllx    %o4, 32, %i3            ! shift left
4006         lduw    [%i0+2], %o4
4007         or      %o4, %i3, %i3
4008         sllx    %i3, 16, %i3
4009         lduh    [%i0+6], %o4
4010         or      %o4, %i3, %i3
4011         stxa    %i3, [%i1]ASI_USER
4012         add     %i0, 8, %i0
4013         subcc   %o3, 8, %o3
4014         bnz     %ncc, .co_unalnhalf
4015         add     %i1, 8, %i1
4016         ba      .co_unalnsrc
4017         nop
4018 
4019         ! Src is Byte aligned, move bytes until dest 64 byte aligned
4020 .co_unalnbyte:
4021         sub     %i1, %i0, %i1           ! share pointer advance
4022 .co_unalnbyte_loop:
4023         ldub    [%i0], %o4
4024         sllx    %o4, 56, %i3
4025         lduh    [%i0+1], %o4
4026         sllx    %o4, 40, %o4
4027         or      %o4, %i3, %i3
4028         lduh    [%i0+3], %o4
4029         sllx    %o4, 24, %o4
4030         or      %o4, %i3, %i3
4031         lduh    [%i0+5], %o4
4032         sllx    %o4, 8, %o4
4033         or      %o4, %i3, %i3
4034         ldub    [%i0+7], %o4
4035         or      %o4, %i3, %i3
4036         stxa    %i3, [%i1+%i0]ASI_USER
4037         subcc   %o3, 8, %o3
4038         bnz     %ncc, .co_unalnbyte_loop
4039         add     %i0, 8, %i0
4040         add     %i1,%i0, %i1            ! restore pointer
4041 
4042         ! Destination is now block (64 byte aligned), src is not 8 byte aligned
4043 .co_unalnsrc:
4044         andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
4045         and     %i2, 0x3f, %i2          ! residue bytes in %i2
        add     %i2, 64, %i2            ! Ensure we don't load beyond
        sub     %i3, 64, %i3            ! end of source buffer
4048 
4049         andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
4050         prefetch [%o4 + (3 * CACHE_LINE)], #one_read
4051         alignaddr %i0, %g0, %g0         ! generate %gsr
4052         add     %i0, %i3, %i0           ! advance %i0 to after blocks
4053         !
4054         ! Determine source alignment to correct 8 byte offset
4055         andcc   %i0, 0x20, %o3
4056         brnz,pn %o3, .co_unaln_1
4057         andcc   %i0, 0x10, %o3
4058         brnz,pn %o3, .co_unaln_01
4059         andcc   %i0, 0x08, %o3
4060         brz,a   %o3, .co_unaln_000
4061         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4062         ba      .co_unaln_001
4063         nop
4064 .co_unaln_01:
4065         brnz,a  %o3, .co_unaln_011
4066         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4067         ba      .co_unaln_010
4068         nop
4069 .co_unaln_1:
4070         brnz,pn %o3, .co_unaln_11
4071         andcc   %i0, 0x08, %o3
4072         brnz,a  %o3, .co_unaln_101
4073         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4074         ba      .co_unaln_100
4075         nop
4076 .co_unaln_11:
4077         brz,pn  %o3, .co_unaln_110
4078         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
4079 
4080 .co_unaln_111:
4081         ldd     [%o4+56], %d14
4082 .co_unaln_111_loop:
4083         add     %o4, 64, %o4
4084         ldda    [%o4]ASI_BLK_P, %d16
4085         faligndata %d14, %d16, %d48
4086         faligndata %d16, %d18, %d50
4087         faligndata %d18, %d20, %d52
4088         faligndata %d20, %d22, %d54
4089         faligndata %d22, %d24, %d56
4090         faligndata %d24, %d26, %d58
4091         faligndata %d26, %d28, %d60
4092         faligndata %d28, %d30, %d62
4093         fmovd   %d30, %d14
4094         stda    %d48, [%i1]ASI_BLK_AIUS
4095         subcc   %i3, 64, %i3
4096         add     %i1, 64, %i1
4097         bgu,pt  %ncc, .co_unaln_111_loop
4098         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4099         ba      .co_unaln_done
4100         nop
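
/*
 * faligndata sketch for the .co_unaln_* loops: alignaddr above set
 * %gsr from the low bits of the source address, so each faligndata
 * extracts one aligned doubleword from a pair of adjacent source
 * doublewords:
 *
 *	loop:
 *		%d16..%d30 = next 64 block-aligned src bytes;
 *		%d48..%d62 = faligndata(carry regs, %d16..%d30);
 *		block initializing store %d48..%d62 to dst;
 *		carry regs = tail of %d16..%d30;	! fmovd
 *	while (%i3 > 0);
 */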
4101 
4102 .co_unaln_110:
4103         ldd     [%o4+48], %d12
4104         ldd     [%o4+56], %d14
4105 .co_unaln_110_loop:
4106         add     %o4, 64, %o4
4107         ldda    [%o4]ASI_BLK_P, %d16
4108         faligndata %d12, %d14, %d48
4109         faligndata %d14, %d16, %d50
4110         faligndata %d16, %d18, %d52
4111         faligndata %d18, %d20, %d54
4112         faligndata %d20, %d22, %d56
4113         faligndata %d22, %d24, %d58
4114         faligndata %d24, %d26, %d60
4115         faligndata %d26, %d28, %d62
4116         fmovd   %d28, %d12
4117         fmovd   %d30, %d14
4118         stda    %d48, [%i1]ASI_BLK_AIUS
4119         subcc   %i3, 64, %i3
4120         add     %i1, 64, %i1
4121         bgu,pt  %ncc, .co_unaln_110_loop
4122         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4123         ba      .co_unaln_done
4124         nop
4125 
4126 .co_unaln_101:
4127         ldd     [%o4+40], %d10
4128         ldd     [%o4+48], %d12
4129         ldd     [%o4+56], %d14
4130 .co_unaln_101_loop:
4131         add     %o4, 64, %o4
4132         ldda    [%o4]ASI_BLK_P, %d16
4133         faligndata %d10, %d12, %d48
4134         faligndata %d12, %d14, %d50
4135         faligndata %d14, %d16, %d52
4136         faligndata %d16, %d18, %d54
4137         faligndata %d18, %d20, %d56
4138         faligndata %d20, %d22, %d58
4139         faligndata %d22, %d24, %d60
4140         faligndata %d24, %d26, %d62
4141         fmovd   %d26, %d10
4142         fmovd   %d28, %d12
4143         fmovd   %d30, %d14
4144         stda    %d48, [%i1]ASI_BLK_AIUS
4145         subcc   %i3, 64, %i3
4146         add     %i1, 64, %i1
4147         bgu,pt  %ncc, .co_unaln_101_loop
4148         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4149         ba      .co_unaln_done
4150         nop
4151 
4152 .co_unaln_100:
4153         ldd     [%o4+32], %d8
4154         ldd     [%o4+40], %d10
4155         ldd     [%o4+48], %d12
4156         ldd     [%o4+56], %d14
4157 .co_unaln_100_loop:
4158         add     %o4, 64, %o4
4159         ldda    [%o4]ASI_BLK_P, %d16
4160         faligndata %d8, %d10, %d48
4161         faligndata %d10, %d12, %d50
4162         faligndata %d12, %d14, %d52
4163         faligndata %d14, %d16, %d54
4164         faligndata %d16, %d18, %d56
4165         faligndata %d18, %d20, %d58
4166         faligndata %d20, %d22, %d60
4167         faligndata %d22, %d24, %d62
4168         fmovd   %d24, %d8
4169         fmovd   %d26, %d10
4170         fmovd   %d28, %d12
4171         fmovd   %d30, %d14
4172         stda    %d48, [%i1]ASI_BLK_AIUS
4173         subcc   %i3, 64, %i3
4174         add     %i1, 64, %i1
4175         bgu,pt  %ncc, .co_unaln_100_loop
4176         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4177         ba      .co_unaln_done
4178         nop
4179 
4180 .co_unaln_011:
4181         ldd     [%o4+24], %d6
4182         ldd     [%o4+32], %d8
4183         ldd     [%o4+40], %d10
4184         ldd     [%o4+48], %d12
4185         ldd     [%o4+56], %d14
4186 .co_unaln_011_loop:
4187         add     %o4, 64, %o4
4188         ldda    [%o4]ASI_BLK_P, %d16
4189         faligndata %d6, %d8, %d48
4190         faligndata %d8, %d10, %d50
4191         faligndata %d10, %d12, %d52
4192         faligndata %d12, %d14, %d54
4193         faligndata %d14, %d16, %d56
4194         faligndata %d16, %d18, %d58
4195         faligndata %d18, %d20, %d60
4196         faligndata %d20, %d22, %d62
4197         fmovd   %d22, %d6
4198         fmovd   %d24, %d8
4199         fmovd   %d26, %d10
4200         fmovd   %d28, %d12
4201         fmovd   %d30, %d14
4202         stda    %d48, [%i1]ASI_BLK_AIUS
4203         subcc   %i3, 64, %i3
4204         add     %i1, 64, %i1
4205         bgu,pt  %ncc, .co_unaln_011_loop
4206         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4207         ba      .co_unaln_done
4208         nop
4209 
4210 .co_unaln_010:
4211         ldd     [%o4+16], %d4
4212         ldd     [%o4+24], %d6
4213         ldd     [%o4+32], %d8
4214         ldd     [%o4+40], %d10
4215         ldd     [%o4+48], %d12
4216         ldd     [%o4+56], %d14
4217 .co_unaln_010_loop:
4218         add     %o4, 64, %o4
4219         ldda    [%o4]ASI_BLK_P, %d16
4220         faligndata %d4, %d6, %d48
4221         faligndata %d6, %d8, %d50
4222         faligndata %d8, %d10, %d52
4223         faligndata %d10, %d12, %d54
4224         faligndata %d12, %d14, %d56
4225         faligndata %d14, %d16, %d58
4226         faligndata %d16, %d18, %d60
4227         faligndata %d18, %d20, %d62
4228         fmovd   %d20, %d4
4229         fmovd   %d22, %d6
4230         fmovd   %d24, %d8
4231         fmovd   %d26, %d10
4232         fmovd   %d28, %d12
4233         fmovd   %d30, %d14
4234         stda    %d48, [%i1]ASI_BLK_AIUS
4235         subcc   %i3, 64, %i3
4236         add     %i1, 64, %i1
4237         bgu,pt  %ncc, .co_unaln_010_loop
4238         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4239         ba      .co_unaln_done
4240         nop
4241 
4242 .co_unaln_001:
4243         ldd     [%o4+8], %d2
4244         ldd     [%o4+16], %d4
4245         ldd     [%o4+24], %d6
4246         ldd     [%o4+32], %d8
4247         ldd     [%o4+40], %d10
4248         ldd     [%o4+48], %d12
4249         ldd     [%o4+56], %d14
4250 .co_unaln_001_loop:
4251         add     %o4, 64, %o4
4252         ldda    [%o4]ASI_BLK_P, %d16
4253         faligndata %d2, %d4, %d48
4254         faligndata %d4, %d6, %d50
4255         faligndata %d6, %d8, %d52
4256         faligndata %d8, %d10, %d54
4257         faligndata %d10, %d12, %d56
4258         faligndata %d12, %d14, %d58
4259         faligndata %d14, %d16, %d60
4260         faligndata %d16, %d18, %d62
4261         fmovd   %d18, %d2
4262         fmovd   %d20, %d4
4263         fmovd   %d22, %d6
4264         fmovd   %d24, %d8
4265         fmovd   %d26, %d10
4266         fmovd   %d28, %d12
4267         fmovd   %d30, %d14
4268         stda    %d48, [%i1]ASI_BLK_AIUS
4269         subcc   %i3, 64, %i3
4270         add     %i1, 64, %i1
4271         bgu,pt  %ncc, .co_unaln_001_loop
4272         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4273         ba      .co_unaln_done
4274         nop
4275 
4276 .co_unaln_000:
4277         ldda    [%o4]ASI_BLK_P, %d0
4278 .co_unaln_000_loop:
4279         add     %o4, 64, %o4
4280         ldda    [%o4]ASI_BLK_P, %d16
4281         faligndata %d0, %d2, %d48
4282         faligndata %d2, %d4, %d50
4283         faligndata %d4, %d6, %d52
4284         faligndata %d6, %d8, %d54
4285         faligndata %d8, %d10, %d56
4286         faligndata %d10, %d12, %d58
4287         faligndata %d12, %d14, %d60
4288         faligndata %d14, %d16, %d62
4289         fmovd   %d16, %d0
4290         fmovd   %d18, %d2
4291         fmovd   %d20, %d4
4292         fmovd   %d22, %d6
4293         fmovd   %d24, %d8
4294         fmovd   %d26, %d10
4295         fmovd   %d28, %d12
4296         fmovd   %d30, %d14
4297         stda    %d48, [%i1]ASI_BLK_AIUS
4298         subcc   %i3, 64, %i3
4299         add     %i1, 64, %i1
4300         bgu,pt  %ncc, .co_unaln_000_loop
4301         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4302 
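         ! Each .co_unaln_* loop above is a software pipelined block
         ! copy: one block load per iteration, eight faligndata merges
         ! of adjacent doublewords, one block store, and fmovd copies to
         ! slide the trailing registers down for the next pass; roughly,
         ! in C (an illustrative sketch only):
         !
         !      do {
         !              load the next 64 bytes into d16..d30;
         !              merge adjacent pairs into d48..d62;
         !              store d48..d62 as one 64 byte block;
         !              slide the leftover words down (fmovd);
         !              n -= 64;
         !      } while (n > 0);
         !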
4303 .co_unaln_done:
4304         ! Handle trailing bytes, 64 to 127
4305         ! Dest long word aligned, Src not long word aligned
4306         cmp     %i2, 15
4307         bleu    %ncc, .co_unaln_short
4308 
4309         andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
4310         and     %i2, 0x7, %i2           ! residue bytes in %i2
4311         add     %i2, 8, %i2
4312         sub     %i3, 8, %i3             ! ensure we don't load past end of src
4313         andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
4314         add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
4315         ldd     [%o4], %d0              ! fetch partial word
4316 .co_unaln_by8:
4317         ldd     [%o4+8], %d2
4318         add     %o4, 8, %o4
4319         faligndata %d0, %d2, %d16
4320         subcc   %i3, 8, %i3
4321         stda    %d16, [%i1]%asi
4322         fmovd   %d2, %d0
4323         bgu,pt  %ncc, .co_unaln_by8
4324         add     %i1, 8, %i1
4325 
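         !
         ! .co_unaln_short below assembles each aligned word from four
         ! unaligned source bytes; roughly, in C (an illustrative sketch
         ! only; the stores really go through %asi):
         !
         !      w = (s[0] << 24) | (s[1] << 16) | (s[2] << 8) | s[3];
         !      *(uint32_t *)d = w;
         !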
4326 .co_unaln_short:
4327         cmp     %i2, 8
4328         blt,pt  %ncc, .co_unalnfin
4329         nop
4330         ldub    [%i0], %o4
4331         sll     %o4, 24, %o3
4332         ldub    [%i0+1], %o4
4333         sll     %o4, 16, %o4
4334         or      %o4, %o3, %o3
4335         ldub    [%i0+2], %o4
4336         sll     %o4, 8, %o4
4337         or      %o4, %o3, %o3
4338         ldub    [%i0+3], %o4
4339         or      %o4, %o3, %o3
4340         stwa    %o3, [%i1]%asi
4341         ldub    [%i0+4], %o4
4342         sll     %o4, 24, %o3
4343         ldub    [%i0+5], %o4
4344         sll     %o4, 16, %o4
4345         or      %o4, %o3, %o3
4346         ldub    [%i0+6], %o4
4347         sll     %o4, 8, %o4
4348         or      %o4, %o3, %o3
4349         ldub    [%i0+7], %o4
4350         or      %o4, %o3, %o3
4351         stwa    %o3, [%i1+4]%asi
4352         add     %i0, 8, %i0
4353         add     %i1, 8, %i1
4354         sub     %i2, 8, %i2
4355 .co_unalnfin:
4356         cmp     %i2, 4
4357         blt,pt  %ncc, .co_unalnz
4358         tst     %i2
4359         ldub    [%i0], %o3              ! read byte
4360         subcc   %i2, 4, %i2             ! reduce count by 4
4361         sll     %o3, 24, %o3            ! position
4362         ldub    [%i0+1], %o4
4363         sll     %o4, 16, %o4            ! position
4364         or      %o4, %o3, %o3           ! merge
4365         ldub    [%i0+2], %o4
4366         sll     %o4, 8, %o4             ! position
4367         or      %o4, %o3, %o3           ! merge
4368         add     %i1, 4, %i1             ! advance dst by 4
4369         ldub    [%i0+3], %o4
4370         add     %i0, 4, %i0             ! advance src by 4
4371         or      %o4, %o3, %o4           ! merge
4372         bnz,pt  %ncc, .co_unaln3x
4373         stwa    %o4, [%i1-4]%asi
4374         ba      .co_exit
4375         nop
4376 .co_unalnz:
4377         bz,pt   %ncc, .co_exit
4378         wr      %l5, %g0, %gsr          ! restore %gsr
4379 .co_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
4380         subcc   %i2, 1, %i2             ! reduce count for cc test
4381         ldub    [%i0], %o4              ! load one byte
4382         bz,pt   %ncc, .co_exit
4383         stba    %o4, [%i1]%asi          ! store one byte
4384         ldub    [%i0+1], %o4            ! load second byte
4385         subcc   %i2, 1, %i2
4386         bz,pt   %ncc, .co_exit
4387         stba    %o4, [%i1+1]%asi        ! store second byte
4388         ldub    [%i0+2], %o4            ! load third byte
4389         stba    %o4, [%i1+2]%asi        ! store third byte
4390 .co_exit:
4391         brnz    %g1, .co_fp_restore
4392         nop
4393         FZERO
4394         wr      %g1, %g0, %fprs
4395         ba,pt   %ncc, .co_ex2
4396         membar  #Sync
4397 .co_fp_restore:
4398         BLD_FP_FROMSTACK(%o4)
4399 .co_ex2:
4400         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4401         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4402         ret
4403         restore %g0, 0, %o0
4404 
4405 .copyout_err:
4406         ldn     [THREAD_REG + T_COPYOPS], %o4
4407         brz     %o4, 2f
4408         nop
4409         ldn     [%o4 + CP_COPYOUT], %g2
4410         jmp     %g2
4411         nop
4412 2:
4413         retl
4414         mov     -1, %o0
4415 
4416 #else   /* NIAGARA_IMPL */
4417 .do_copyout:
4418         !
4419         ! Check the length and bail if zero.
4420         !
4421         tst     %o2
4422         bnz,pt  %ncc, 1f
4423         nop
4424         retl
4425         clr     %o0
4426 1:
4427         sethi   %hi(copyio_fault), %o4
4428         or      %o4, %lo(copyio_fault), %o4
4429         sethi   %hi(copyio_fault_nowindow), %o3
4430         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
4431         or      %o3, %lo(copyio_fault_nowindow), %o3
4432         membar  #Sync
4433         stn     %o3, [THREAD_REG + T_LOFAULT]
4434 
4435         mov     %o0, SAVE_SRC
4436         mov     %o1, SAVE_DST
4437         mov     %o2, SAVE_COUNT
4438 
4439         !
4440         ! Check to see if we're more than SMALL_LIMIT (7 bytes).
4441         ! Run in leaf mode, using the %o regs as our input regs.
4442         !
4443         subcc   %o2, SMALL_LIMIT, %o3
4444         bgu,a,pt %ncc, .dco_ns
4445         or      %o0, %o1, %o3
4446         !
4447         ! What was previously ".small_copyout"
4448         ! Do full differenced copy.
4449         !
4450 .dcobcp:
4451         sub     %g0, %o2, %o3           ! negate count
4452         add     %o0, %o2, %o0           ! make %o0 point at the end
4453         add     %o1, %o2, %o1           ! make %o1 point at the end
4454         ba,pt   %ncc, .dcocl
4455         ldub    [%o0 + %o3], %o4        ! load first byte
4456         !
4457         ! %o0 and %o1 point at the end and remain pointing at the end
4458         ! of their buffers. We pull things out by adding %o3 (which is
4459         ! the negation of the length) to the buffer end which gives us
4460         ! the current location in the buffers. By incrementing %o3 we walk
4461         ! through both buffers without having to bump each buffer's
4462         ! pointer. A very fast 4 instruction loop.
4463         !
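         !
         ! Roughly, in C (an illustrative sketch only; the store is
         ! really an ASI_USER store and n is the original count):
         !
         !      ssize_t i = -(ssize_t)n;
         !      char *s = src + n, *d = dst + n;
         !      do {
         !              d[i] = s[i];
         !      } while (++i < 0);
         !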
4464         .align 16
4465 .dcocl:
4466         stba    %o4, [%o1 + %o3]ASI_USER
4467         inccc   %o3
4468         bl,a,pt %ncc, .dcocl
4469         ldub    [%o0 + %o3], %o4
4470         !
4471         ! We're done. Go home.
4472         !
4473         membar  #Sync
4474         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
4475         retl
4476         clr     %o0
4477         !
4478         ! Try aligned copies from here.
4479         !
4480 .dco_ns:
4481         ! %o0 = kernel addr (to be copied from)
4482         ! %o1 = user addr (to be copied to)
4483         ! %o2 = length
4484         ! %o3 = %o0 | %o1 (used for alignment checking)
4485         ! %o4 is alternate lo_fault
4486         ! %o5 is original lo_fault
4487         !
4488         ! See if we're single byte aligned. If we are, check the
4489         ! limit for single byte copies. If we're smaller or equal,
4490         ! bounce to the byte for byte copy loop. Otherwise do it in
4491         ! HW (if enabled).
4492         !
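         !
         ! The dispatch, as a rough C sketch (illustrative only;
         ! hw_copy_limit_1 is the tunable loaded below and zero means
         ! the HW assisted copy is disabled):
         !
         !      if ((src | dst) & 1) {
         !              if (hw_copy_limit_1 == 0 || n <= hw_copy_limit_1)
         !                      copy byte for byte;
         !              else
         !                      goto big_copyout;
         !      }
         !      ! otherwise try the 8, 4 and 2 byte cases the same way
         !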
4493         btst    1, %o3
4494         bz,pt   %icc, .dcoh8
4495         btst    7, %o3
4496         !
4497         ! Single byte aligned. Do we do it via HW or via
4498         ! byte for byte? Do a quick no memory reference
4499         ! check to pick up small copies.
4500         !
4501         sethi   %hi(hw_copy_limit_1), %o3
4502         !
4503         ! Big enough that we need to check the HW limit for
4504         ! this size copy.
4505         !
4506         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
4507         !
4508         ! Is HW copy on? If not, do everything byte for byte.
4509         !
4510         tst     %o3
4511         bz,pn   %icc, .dcobcp
4512         subcc   %o3, %o2, %o3
4513         !
4514         ! If we're less than or equal to the single byte copy limit,
4515         ! bop to the copy loop.
4516         !
4517         bge,pt  %ncc, .dcobcp
4518         nop
4519         !
4520         ! We're big enough and copy is on. Do it with HW.
4521         !
4522         ba,pt   %ncc, .big_copyout
4523         nop
4524 .dcoh8:
4525         !
4526         ! 8 byte aligned?
4527         !
4528         bnz,a   %ncc, .dcoh4
4529         btst    3, %o3
4530         !
4531         ! See if we're in the "small range".
4532         ! If so, go off and do the copy.
4533         ! If not, load the hard limit. %o3 is
4534         ! available for reuse.
4535         !
4536         sethi   %hi(hw_copy_limit_8), %o3
4537         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
4538         !
4539         ! If it's zero, there's no HW bcopy.
4540         ! Bop off to the aligned copy.
4541         !
4542         tst     %o3
4543         bz,pn   %icc, .dcos8
4544         subcc   %o3, %o2, %o3
4545         !
4546         ! We're negative if our size is larger than hw_copy_limit_8.
4547         !
4548         bge,pt  %ncc, .dcos8
4549         nop
4550         !
4551         ! HW assist is on and we're large enough. Do it.
4552         !
4553         ba,pt   %ncc, .big_copyout
4554         nop
4555 .dcos8:
4556         !
4557         ! Housekeeping for copy loops. Uses same idea as in the byte for
4558         ! byte copy loop above.
4559         !
4560         add     %o0, %o2, %o0
4561         add     %o1, %o2, %o1
4562         sub     %g0, %o2, %o3
4563         ba,pt   %ncc, .dodebc
4564         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
4565         !
4566         ! 4 byte aligned?
4567         !
4568 .dcoh4:
4569         bnz,pn  %ncc, .dcoh2
4570         !
4571         ! See if we're in the "small range".
4572         ! If so, go off and do the copy.
4573         ! If not, load the hard limit. %o3 is
4574         ! available for reuse.
4575         !
4576         sethi   %hi(hw_copy_limit_4), %o3
4577         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
4578         !
4579         ! If it's zero, there's no HW bcopy.
4580         ! Bop off to the aligned copy.
4581         !
4582         tst     %o3
4583         bz,pn   %icc, .dcos4
4584         subcc   %o3, %o2, %o3
4585         !
4586         ! We're negative if our size is larger than hw_copy_limit_4.
4587         !
4588         bge,pt  %ncc, .dcos4
4589         nop
4590         !
4591         ! HW assist is on and we're large enough. Do it.
4592         !
4593         ba,pt   %ncc, .big_copyout
4594         nop
4595 .dcos4:
4596         add     %o0, %o2, %o0
4597         add     %o1, %o2, %o1
4598         sub     %g0, %o2, %o3
4599         ba,pt   %ncc, .dodfbc
4600         srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
4601         !
4602         ! We must be 2 byte aligned. Off we go.
4603         ! The check for small copies was done in the
4604         ! delay at .dcoh4
4605         !
4606 .dcoh2:
4607         ble     %ncc, .dcos2
4608         sethi   %hi(hw_copy_limit_2), %o3
4609         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
4610         tst     %o3
4611         bz,pn   %icc, .dcos2
4612         subcc   %o3, %o2, %o3
4613         bge,pt  %ncc, .dcos2
4614         nop
4615         !
4616         ! HW is on and we're big enough. Do it.
4617         !
4618         ba,pt   %ncc, .big_copyout
4619         nop
4620 .dcos2:
4621         add     %o0, %o2, %o0
4622         add     %o1, %o2, %o1
4623         sub     %g0, %o2, %o3
4624         ba,pt   %ncc, .dodtbc
4625         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
4626 .small_copyout:
4627         !
4628         ! Why are we doing this AGAIN? There are certain conditions in
4629         ! big_copyout that will cause us to forego the HW assisted copies
4630         ! and bounce back to a non-HW assisted copy. This dispatches those
4631         ! copies. Note that we branch around this in the main line code.
4632         !
4633         ! We make no check for limits or HW enablement here. We've
4634         ! already been told that we're a poster child so just go off
4635         ! and do it.
4636         !
4637         or      %o0, %o1, %o3
4638         btst    1, %o3
4639         bnz     %icc, .dcobcp           ! Most likely
4640         btst    7, %o3
4641         bz      %icc, .dcos8
4642         btst    3, %o3
4643         bz      %icc, .dcos4
4644         nop
4645         ba,pt   %ncc, .dcos2
4646         nop
4647         .align 32
4648 .dodebc:
4649         ldx     [%o0 + %o3], %o4
4650         deccc   %o2
4651         stxa    %o4, [%o1 + %o3]ASI_USER
4652         bg,pt   %ncc, .dodebc
4653         addcc   %o3, 8, %o3
4654         !
4655         ! End of copy loop. Check to see if we're done. Most
4656         ! eight byte aligned copies end here.
4657         !
4658         bz,pt   %ncc, .dcofh
4659         nop
4660         !
4661         ! Something is left. Do it byte for byte.
4662         !
4663         ba,pt   %ncc, .dcocl
4664         ldub    [%o0 + %o3], %o4        ! load next byte
4665         !
4666         ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
4667         !
4668         .align 32
4669 .dodfbc:
4670         lduw    [%o0 + %o3], %o4
4671         deccc   %o2
4672         sta     %o4, [%o1 + %o3]ASI_USER
4673         bg,pt   %ncc, .dodfbc
4674         addcc   %o3, 4, %o3
4675         !
4676         ! End of copy loop. Check to see if we're done. Most
4677         ! four byte aligned copies end here.
4678         !
4679         bz,pt   %ncc, .dcofh
4680         nop
4681         !
4682         ! Something is left. Do it byte for byte.
4683         !
4684         ba,pt   %ncc, .dcocl
4685         ldub    [%o0 + %o3], %o4        ! load next byte
4686         !
4687         ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
4688         ! copy.
4689         !
4690         .align 32
4691 .dodtbc:
4692         lduh    [%o0 + %o3], %o4
4693         deccc   %o2
4694         stha    %o4, [%o1 + %o3]ASI_USER
4695         bg,pt   %ncc, .dodtbc
4696         addcc   %o3, 2, %o3
4697         !
4698         ! End of copy loop. Anything left?
4699         !
4700         bz,pt   %ncc, .dcofh
4701         nop
4702         !
4703         ! Deal with the last byte
4704         !
4705         ldub    [%o0 + %o3], %o4
4706         stba    %o4, [%o1 + %o3]ASI_USER
4707 .dcofh:
4708         membar  #Sync
4709         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4710         retl
4711         clr     %o0
4712 
4713 .big_copyout:
4714         ! We're going to go off and do a block copy.
4715         ! Switch fault handlers and grab a window. We
4716         ! don't do a membar #Sync since we've done only
4717         ! kernel data to this point.
4718         stn     %o4, [THREAD_REG + T_LOFAULT]
4719 
4720         ! Copies that reach here are larger than 256 bytes. The
4721         ! hw_copy_limit_1 is set to 256. Never set this limit to
4722         ! less than 128 bytes.
4723         save    %sp, -SA(MINFRAME), %sp
4724 .do_block_copyout:
4725 
4726         ! Swap src/dst since the code below is memcpy code
4727         ! and memcpy/bcopy have different calling sequences
4728         mov     %i1, %i5
4729         mov     %i0, %i1
4730         mov     %i5, %i0
4731 
4732         ! Block (64 bytes) align the destination.
4733         andcc   %i0, 0x3f, %i3          ! is dst block aligned
4734         bz      %ncc, copyout_blalign   ! dst already block aligned
4735         sub     %i3, 0x40, %i3
4736         neg     %i3                     ! bytes till dst 64 bytes aligned
4737         sub     %i2, %i3, %i2           ! update i2 with new count
4738 
4739         ! Based on source and destination alignment do
4740         ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
4741 
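         ! As a rough C sketch (illustrative only), the pre-alignment
         ! step moves %i3 = 64 - (dst & 0x3f) bytes in the widest size
         ! both addresses allow:
         !
         !      if (((dst | src) & 7) == 0)      move 8 bytes at a time;
         !      else if (((dst | src) & 3) == 0) move 4 bytes at a time;
         !      else if (((dst | src) & 1) == 0) move 2 bytes at a time;
         !      else                             move 1 byte at a time;
         !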
4742         ! Is dst & src 8B aligned
4743         or      %i0, %i1, %o2
4744         andcc   %o2, 0x7, %g0
4745         bz      %ncc, .co_alewdcp
4746         nop
4747 
4748         ! Is dst & src 4B aligned
4749         andcc   %o2, 0x3, %g0
4750         bz      %ncc, .co_alwdcp
4751         nop
4752 
4753         ! Is dst & src 2B aligned
4754         andcc   %o2, 0x1, %g0
4755         bz      %ncc, .co_alhlfwdcp
4756         nop
4757 
4758         ! 1B aligned
4759 1:      ldub    [%i1], %o2
4760         stba    %o2, [%i0]ASI_USER
4761         inc     %i1
4762         deccc   %i3
4763         bgu,pt  %ncc, 1b
4764         inc     %i0
4765 
4766         ba      copyout_blalign
4767         nop
4768 
4769         ! dst & src 4B aligned
4770 .co_alwdcp:
4771         ld      [%i1], %o2
4772         sta     %o2, [%i0]ASI_USER
4773         add     %i1, 0x4, %i1
4774         subcc   %i3, 0x4, %i3
4775         bgu,pt  %ncc, .co_alwdcp
4776         add     %i0, 0x4, %i0
4777 
4778         ba      copyout_blalign
4779         nop
4780 
4781         ! dst & src 2B aligned
4782 .co_alhlfwdcp:
4783         lduh    [%i1], %o2
4784         stuha   %o2, [%i0]ASI_USER
4785         add     %i1, 0x2, %i1
4786         subcc   %i3, 0x2, %i3
4787         bgu,pt  %ncc, .co_alhlfwdcp
4788         add     %i0, 0x2, %i0
4789 
4790         ba      copyout_blalign
4791         nop
4792 
4793         ! dst & src 8B aligned
4794 .co_alewdcp:
4795         ldx     [%i1], %o2
4796         stxa    %o2, [%i0]ASI_USER
4797         add     %i1, 0x8, %i1
4798         subcc   %i3, 0x8, %i3
4799         bgu,pt  %ncc, .co_alewdcp
4800         add     %i0, 0x8, %i0
4801 
4802         ! Now Destination is block (64 bytes) aligned
4803 copyout_blalign:
4804         andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
4805         sub     %i2, %i3, %i2           ! Residue bytes in %i2
4806 
4807         mov     ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4808 
4809         andcc   %i1, 0xf, %o2           ! is src quadword aligned
4810         bz,pn   %xcc, .co_blkcpy        ! src offset in %o2 (low 4 bits)
4811         nop
4812         cmp     %o2, 0x8
4813         bg      .co_upper_double
4814         nop
4815         bl      .co_lower_double
4816         nop
4817 
4818         ! Falls through when the source offset is equal to 8, i.e. the
4819         ! source is double word aligned. In this case no shift/merge of
4820         ! the data is required.
4821 
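         ! In the unaligned cases below, ALIGN_DATA merges each pair of
         ! neighbouring doublewords into one aligned doubleword; roughly,
         ! in C (illustrative only; ls/rs are the shift counts computed
         ! at .co_lower_double and .co_upper_double, with rs == 64 - ls):
         !
         !      aligned = (hi << ls) | (lo >> rs);
         !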
4822         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
4823         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
4824         prefetch [%l0+0x0], #one_read
4825         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4826 .co_loop0:
4827         add     %i1, 0x10, %i1
4828         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4829         prefetch [%l0+0x40], #one_read
4830 
4831         stxa    %l3, [%i0+0x0]%asi
4832         stxa    %l4, [%i0+0x8]%asi
4833 
4834         add     %i1, 0x10, %i1
4835         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4836 
4837         stxa    %l5, [%i0+0x10]%asi
4838         stxa    %l2, [%i0+0x18]%asi
4839 
4840         add     %i1, 0x10, %i1
4841         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4842 
4843         stxa    %l3, [%i0+0x20]%asi
4844         stxa    %l4, [%i0+0x28]%asi
4845 
4846         add     %i1, 0x10, %i1
4847         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4848 
4849         stxa    %l5, [%i0+0x30]%asi
4850         stxa    %l2, [%i0+0x38]%asi
4851 
4852         add     %l0, 0x40, %l0
4853         subcc   %i3, 0x40, %i3
4854         bgu,pt  %xcc, .co_loop0
4855         add     %i0, 0x40, %i0
4856         ba      .co_blkdone
4857         add     %i1, %o2, %i1           ! increment the source by src offset
4858                                         ! the src offset was stored in %o2
4859 
4860 .co_lower_double:
4861 
4862         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
4863         sll     %o2, 3, %o0             ! %o0 left shift
4864         mov     0x40, %o1
4865         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
4866         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
4867         prefetch [%l0+0x0], #one_read
4868         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2    ! partial data in %l2;
4869                                         ! %l3 has complete data
4870 .co_loop1:
4871         add     %i1, 0x10, %i1
4872         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4    ! %l4 has partial data
4873                                                         ! for this read.
4874         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
4875                                                         ! into %l2 and %l3
4876         prefetch [%l0+0x40], #one_read
4877 
4878         stxa    %l2, [%i0+0x0]%asi
4879         stxa    %l3, [%i0+0x8]%asi
4880 
4881         add     %i1, 0x10, %i1
4882         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4883         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
4884                                                         ! %l4 from previous read
4885                                                         ! into %l4 and %l5
4886         stxa    %l4, [%i0+0x10]%asi
4887         stxa    %l5, [%i0+0x18]%asi
4888 
4889         ! Repeat the same for next 32 bytes.
4890 
4891         add     %i1, 0x10, %i1
4892         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4893         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
4894 
4895         stxa    %l2, [%i0+0x20]%asi
4896         stxa    %l3, [%i0+0x28]%asi
4897 
4898         add     %i1, 0x10, %i1
4899         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4900         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
4901 
4902         stxa    %l4, [%i0+0x30]%asi
4903         stxa    %l5, [%i0+0x38]%asi
4904 
4905         add     %l0, 0x40, %l0
4906         subcc   %i3, 0x40, %i3
4907         bgu,pt  %xcc, .co_loop1
4908         add     %i0, 0x40, %i0
4909         ba      .co_blkdone
4910         add     %i1, %o2, %i1           ! increment the source by src offset
4911                                         ! the src offset was stored in %o2
4912 
4913 .co_upper_double:
4914 
4915         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
4916         sub     %o2, 0x8, %o0
4917         sll     %o0, 3, %o0             ! %o0 left shift
4918         mov     0x40, %o1
4919         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
4920         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
4921         prefetch [%l0+0x0], #one_read
4922         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2    ! partial data in %l3
4923                                                         ! for this read and
4924                                                         ! no data in %l2
4925 .co_loop2:
4926         add     %i1, 0x10, %i1
4927         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4    ! %l4 has complete data
4928                                                         ! and %l5 has partial
4929         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
4930                                                         ! into %l3 and %l4
4931         prefetch [%l0+0x40], #one_read
4932 
4933         stxa    %l3, [%i0+0x0]%asi
4934         stxa    %l4, [%i0+0x8]%asi
4935 
4936         add     %i1, 0x10, %i1
4937         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4938         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
4939                                                         ! %l5 from previous read
4940                                                         ! into %l5 and %l2
4941 
4942         stxa    %l5, [%i0+0x10]%asi
4943         stxa    %l2, [%i0+0x18]%asi
4944 
4945         ! Repeat the same for next 32 bytes.
4946 
4947         add     %i1, 0x10, %i1
4948         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4949         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
4950 
4951         stxa    %l3, [%i0+0x20]%asi
4952         stxa    %l4, [%i0+0x28]%asi
4953 
4954         add     %i1, 0x10, %i1
4955         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4956         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
4957 
4958         stxa    %l5, [%i0+0x30]%asi
4959         stxa    %l2, [%i0+0x38]%asi
4960 
4961         add     %l0, 0x40, %l0
4962         subcc   %i3, 0x40, %i3
4963         bgu,pt  %xcc, .co_loop2
4964         add     %i0, 0x40, %i0
4965         ba      .co_blkdone
4966         add     %i1, %o2, %i1           ! increment the source by src offset
4967                                         ! the src offset was stored in %o2
4968 
4969 
4970         ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
4971 .co_blkcpy:
4972 
4973         andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
4974         prefetch [%o0+0x0], #one_read
4975 1:
4976         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
4977         add     %i1, 0x10, %i1
4978         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4979         add     %i1, 0x10, %i1
4980 
4981         prefetch [%o0+0x40], #one_read
4982 
4983         stxa    %l0, [%i0+0x0]%asi
4984 
4985         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4986         add     %i1, 0x10, %i1
4987         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
4988         add     %i1, 0x10, %i1
4989 
4990         stxa    %l1, [%i0+0x8]%asi
4991         stxa    %l2, [%i0+0x10]%asi
4992         stxa    %l3, [%i0+0x18]%asi
4993         stxa    %l4, [%i0+0x20]%asi
4994         stxa    %l5, [%i0+0x28]%asi
4995         stxa    %l6, [%i0+0x30]%asi
4996         stxa    %l7, [%i0+0x38]%asi
4997 
4998         add     %o0, 0x40, %o0
4999         subcc   %i3, 0x40, %i3
5000         bgu,pt  %xcc, 1b
5001         add     %i0, 0x40, %i0
5002 
5003 .co_blkdone:
5004         membar  #Sync
5005 
5006         brz,pt  %i2, .copyout_exit
5007         nop
5008 
5009         ! Handle trailing bytes
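         ! Roughly, in C (illustrative only; each store is really an
         ! ASI_USER store, and the sized loops are only entered with
         ! n >= 8):
         !
         !      if (((src | dst) & 7) == 0)
         !              do { move 8; n -= 8; } while (n > 8);
         !      else if (((src | dst) & 3) == 0)
         !              do { move 4; n -= 4; } while (n > 4);
         !      else if (((src | dst) & 1) == 0)
         !              do { move 2; n -= 2; } while (n > 2);
         !      while (n > 0) { move 1; n--; }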
5010         cmp     %i2, 0x8
5011         blu,pt  %ncc, .co_residue
5012         nop
5013 
5014         ! Can we do some 8B ops
5015         or      %i1, %i0, %o2
5016         andcc   %o2, 0x7, %g0
5017         bnz     %ncc, .co_last4
5018         nop
5019 
5020         ! Do 8 byte ops as long as possible
5021 .co_last8:
5022         ldx     [%i1], %o2
5023         stxa    %o2, [%i0]ASI_USER
5024         add     %i1, 0x8, %i1
5025         sub     %i2, 0x8, %i2
5026         cmp     %i2, 0x8
5027         bgu,pt  %ncc, .co_last8
5028         add     %i0, 0x8, %i0
5029 
5030         brz,pt  %i2, .copyout_exit
5031         nop
5032 
5033         ba      .co_residue
5034         nop
5035 
5036 .co_last4:
5037         ! Can we do 4B ops
5038         andcc   %o2, 0x3, %g0
5039         bnz     %ncc, .co_last2
5040         nop
5041 1:
5042         ld      [%i1], %o2
5043         sta     %o2, [%i0]ASI_USER
5044         add     %i1, 0x4, %i1
5045         sub     %i2, 0x4, %i2
5046         cmp     %i2, 0x4
5047         bgu,pt  %ncc, 1b
5048         add     %i0, 0x4, %i0
5049 
5050         brz,pt  %i2, .copyout_exit
5051         nop
5052 
5053         ba      .co_residue
5054         nop
5055 
5056 .co_last2:
5057         ! Can we do 2B ops
5058         andcc   %o2, 0x1, %g0
5059         bnz     %ncc, .co_residue
5060         nop
5061 
5062 1:
5063         lduh    [%i1], %o2
5064         stuha   %o2, [%i0]ASI_USER
5065         add     %i1, 0x2, %i1
5066         sub     %i2, 0x2, %i2
5067         cmp     %i2, 0x2
5068         bgu,pt  %ncc, 1b
5069         add     %i0, 0x2, %i0
5070 
5071         brz,pt  %i2, .copyout_exit
5072         nop
5073 
5074         ! Copy the residue as byte copy
5075 .co_residue:
5076         ldub    [%i1], %i4
5077         stba    %i4, [%i0]ASI_USER
5078         inc     %i1
5079         deccc   %i2
5080         bgu,pt  %xcc, .co_residue
5081         inc     %i0
5082 
5083 .copyout_exit:
5084         membar  #Sync
5085         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
5086         ret
5087         restore %g0, 0, %o0
5088 
5089 .copyout_err:
5090         ldn     [THREAD_REG + T_COPYOPS], %o4
5091         brz     %o4, 2f
5092         nop
5093         ldn     [%o4 + CP_COPYOUT], %g2
5094         jmp     %g2
5095         nop
5096 2:
5097         retl
5098         mov     -1, %o0
5099 #endif  /* NIAGARA_IMPL */
5100         SET_SIZE(copyout)
5101 
5102 #endif  /* lint */
5103 
5104 
5105 #ifdef  lint
5106 
5107 /*ARGSUSED*/
5108 int
5109 xcopyout(const void *kaddr, void *uaddr, size_t count)
5110 { return (0); }
5111 
5112 #else   /* lint */
5113 
5114         ENTRY(xcopyout)
5115         sethi   %hi(.xcopyout_err), REAL_LOFAULT
5116         b       .do_copyout
5117         or      REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
5118 .xcopyout_err:
5119         ldn     [THREAD_REG + T_COPYOPS], %o4
5120         brz     %o4, 2f
5121         nop
5122         ldn     [%o4 + CP_XCOPYOUT], %g2
5123         jmp     %g2
5124         nop
5125 2:
5126         retl
5127         mov     %g1, %o0
5128         SET_SIZE(xcopyout)
5129 
5130 #endif  /* lint */
5131         
5132 #ifdef  lint
5133 
5134 /*ARGSUSED*/
5135 int
5136 xcopyout_little(const void *kaddr, void *uaddr, size_t count)
5137 { return (0); }
5138 
5139 #else   /* lint */
5140 
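     /*
      * xcopyout_little effectively copies the buffer out with the byte
      * order reversed, via the little-endian user ASI. Roughly, in C
      * (an illustrative sketch only; each store goes through ASI_AIUSL
      * with the .little_err lofault handler installed):
      *
      *     for (i = 0; i < n; i++)
      *             uaddr[i] = kaddr[n - 1 - i];
      */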
5141         ENTRY(xcopyout_little)
5142         sethi   %hi(.little_err), %o4
5143         ldn     [THREAD_REG + T_LOFAULT], %o5
5144         or      %o4, %lo(.little_err), %o4
5145         membar  #Sync                   ! sync error barrier
5146         stn     %o4, [THREAD_REG + T_LOFAULT]
5147 
5148         subcc   %g0, %o2, %o3
5149         add     %o0, %o2, %o0
5150         bz,pn   %ncc, 2f                ! check for zero bytes
5151         sub     %o2, 1, %o4
5152         add     %o0, %o4, %o0           ! start w/last byte
5153         add     %o1, %o2, %o1
5154         ldub    [%o0+%o3], %o4
5155 
5156 1:      stba    %o4, [%o1+%o3]ASI_AIUSL
5157         inccc   %o3
5158         sub     %o0, 2, %o0             ! get next byte
5159         bcc,a,pt %ncc, 1b
5160         ldub    [%o0+%o3], %o4
5161 
5162 2:      membar  #Sync                   ! sync error barrier
5163         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
5164         retl
5165         mov     %g0, %o0                ! return (0)
5166         SET_SIZE(xcopyout_little)
5167 
5168 #endif  /* lint */
5169 
5170 /*
5171  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
5172  */
5173 
5174 #if defined(lint)
5175 
5176 /*ARGSUSED*/
5177 int
5178 copyin(const void *uaddr, void *kaddr, size_t count)
5179 { return (0); }
5180 
5181 #else   /* lint */
5182 
5183         ENTRY(copyin)
5184         sethi   %hi(.copyin_err), REAL_LOFAULT
5185         or      REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
5186 
5187 #if !defined(NIAGARA_IMPL)
5188 .do_copyin:
5189         tst     %o2                     ! check for zero count;  quick exit
5190         bz,pt   %ncc, .ci_smallqx
5191         mov     %o0, SAVE_SRC
5192         mov     %o1, SAVE_DST
5193         mov     %o2, SAVE_COUNT
5194         cmp     %o2, FP_COPY            ! check for small copy/leaf case
5195         bgt,pt  %ncc, .ci_copy_more
5196         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
5197 /*
5198  * Small copy in code
5199  * 
5200  */
5201         sethi   %hi(copyio_fault_nowindow), %o3
5202         or      %o3, %lo(copyio_fault_nowindow), %o3
5203         membar  #Sync
5204         stn     %o3, [THREAD_REG + T_LOFAULT]
5205 
5206         mov     ASI_USER, %asi
5207         cmp     %o2, SHORTCOPY          ! make sure there is enough to align
5208         ble,pt  %ncc, .ci_smallest
5209         andcc   %o1, 0x7, %o3           ! is dest long word aligned
5210         bnz,pn  %ncc, .ci_align
5211         andcc   %o1, 1, %o3             ! is dest byte aligned
5212 
5213 ! Destination is long word aligned
5214 .ci_al_src:
5215         andcc   %o0, 7, %o3
5216         brnz,pt %o3, .ci_src_dst_unal8
5217         nop
5218 /*
5219  * Special case for handling when src and dest are both long word aligned
5220  * and the total data to move is less than FP_COPY bytes.
5221  * Also handles finish up for large block moves, so may be less than 32 bytes.
5222  */
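     /*
      * The loops below keep the remaining count biased so that a single
      * subcc/addcc both adjusts and tests it; roughly, in C (an
      * illustrative sketch only):
      *
      *     n -= 31;            ! n > 0 now means >= 32 bytes left
      *     while (n > 0) {
      *             copy 32 bytes (4 x 8 byte loads/stores);
      *             n -= 32;
      *     }
      *     n += 24;            ! n > 0 now means >= 8 bytes left
      *     while (n > 0) {
      *             copy 8 bytes;
      *             n -= 8;
      *     }
      *     n += 7;             ! n is now the exact 0..7 residue
      */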
5223 .ci_medlong:
5224         subcc   %o2, 31, %o2            ! adjust length to allow cc test
5225         ble,pt  %ncc, .ci_medl31
5226         nop
5227 .ci_medl32:
5228         ldxa    [%o0]%asi, %o4          ! move 32 bytes
5229         subcc   %o2, 32, %o2            ! decrement length count by 32
5230         stx     %o4, [%o1]
5231         ldxa    [%o0+8]%asi, %o4
5232         stx     %o4, [%o1+8]
5233         ldxa    [%o0+16]%asi, %o4
5234         add     %o0, 32, %o0            ! increase src ptr by 32
5235         stx     %o4, [%o1+16]
5236         ldxa    [%o0-8]%asi, %o4
5237         add     %o1, 32, %o1            ! increase dst ptr by 32
5238         bgu,pt  %ncc, .ci_medl32        ! repeat if at least 32 bytes left
5239         stx     %o4, [%o1-8]
5240 .ci_medl31:
5241         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5242         ble,pt  %ncc, .ci_medl7         ! skip if 7 or fewer bytes left
5243         nop
5244 .ci_medl8:
5245         ldxa    [%o0]%asi, %o4          ! move 8 bytes
5246         add     %o0, 8, %o0             ! increase src ptr by 8
5247         subcc   %o2, 8, %o2             ! decrease count by 8
5248         add     %o1, 8, %o1             ! increase dst ptr by 8
5249         bgu,pt  %ncc, .ci_medl8
5250         stx     %o4, [%o1-8]
5251 .ci_medl7:
5252         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
5253         bnz,pt  %ncc, .ci_small4        ! do final bytes if not finished
5254         nop
5255 .ci_smallx:                             ! finish up and exit
5256         membar  #Sync
5257         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5258 .ci_smallqx:
5259         retl
5260         mov     %g0, %o0
5261 
5262 .ci_small4:
5263         cmp     %o2, 4
5264         blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
5265         nop                             !
5266         lda     [%o0]%asi, %o4          ! move 4 bytes
5267         add     %o0, 4, %o0             ! increase src ptr by 4
5268         add     %o1, 4, %o1             ! increase dst ptr by 4
5269         subcc   %o2, 4, %o2             ! decrease count by 4
5270         bz      %ncc, .ci_smallx
5271         stw     %o4, [%o1-4]
5272 
5273 .ci_small3x:                            ! Exactly 1, 2, or 3 bytes remain
5274         subcc   %o2, 1, %o2             ! reduce count for cc test
5275         lduba   [%o0]%asi, %o4          ! load one byte
5276         bz,pt   %ncc, .ci_smallx
5277         stb     %o4, [%o1]              ! store one byte
5278         lduba   [%o0+1]%asi, %o4        ! load second byte
5279         subcc   %o2, 1, %o2
5280         bz,pt   %ncc, .ci_smallx
5281         stb     %o4, [%o1+1]            ! store second byte
5282         lduba   [%o0+2]%asi, %o4        ! load third byte
5283         ba      .ci_smallx
5284         stb     %o4, [%o1+2]            ! store third byte
5285 
5286 .ci_smallest:                           ! 7 or fewer bytes remain
5287         cmp     %o2, 4
5288         blt,pt  %ncc, .ci_small3x
5289         nop
5290         lduba   [%o0]%asi, %o4          ! read byte
5291         subcc   %o2, 4, %o2             ! reduce count by 4
5292         stb     %o4, [%o1]              ! write byte
5293         lduba   [%o0+1]%asi, %o4        ! repeat for total of 4 bytes
5294         add     %o0, 4, %o0             ! advance src by 4
5295         stb     %o4, [%o1+1]
5296         lduba   [%o0-2]%asi, %o4
5297         add     %o1, 4, %o1             ! advance dst by 4
5298         stb     %o4, [%o1-2]
5299         lduba   [%o0-1]%asi, %o4
5300         bnz,pt  %ncc, .ci_small3x
5301         stb     %o4, [%o1-1]
5302         membar  #Sync
5303         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5304         retl
5305         mov     %g0, %o0
5306 
5307 .ci_align:
5308         bnz,pt  %ncc, .ci_al_d1
5309 .ci_al_d1f:                             ! dest is now half word aligned
5310         andcc   %o1, 2, %o3             ! is dest word aligned
5311         bnz,pt  %ncc, .ci_al_d2
5312 .ci_al_d2f:                             ! dest is now word aligned
5313         andcc   %o1, 4, %o3             ! is dest longword aligned?
5314         bz,pt   %ncc, .ci_al_src
5315         nop
5316 .ci_al_d4:                              ! dest is word aligned;  src is unknown
5317         lduba   [%o0]%asi, %o4          ! move a word (src align unknown)
5318         lduba   [%o0+1]%asi, %o3
5319         sll     %o4, 24, %o4            ! position
5320         sll     %o3, 16, %o3            ! position
5321         or      %o4, %o3, %o3           ! merge
5322         lduba   [%o0+2]%asi, %o4
5323         sll     %o4, 8, %o4             ! position
5324         or      %o4, %o3, %o3           ! merge
5325         lduba   [%o0+3]%asi, %o4
5326         or      %o4, %o3, %o4           ! merge
5327         stw     %o4,[%o1]               ! store four bytes
5328         add     %o0, 4, %o0             ! adjust src by 4
5329         add     %o1, 4, %o1             ! adjust dest by 4
5330         sub     %o2, 4, %o2             ! adjust count by 4
5331         andcc   %o0, 7, %o3             ! check for src long word alignment
5332         brz,pt  %o3, .ci_medlong
5333 .ci_src_dst_unal8:
5334         ! dst is 8-byte aligned, src is not
5335         ! Size is less than FP_COPY
5336         ! Following code is to select for alignment
5337         andcc   %o0, 0x3, %o3           ! test word alignment
5338         bz,pt   %ncc, .ci_medword
5339         nop
5340         andcc   %o0, 0x1, %o3           ! test halfword alignment
5341         bnz,pt  %ncc, .ci_med_byte      ! go to byte move if not halfword
5342         andcc   %o0, 0x2, %o3           ! test which byte alignment
5343         ba      .ci_medhalf
5344         nop
5345 .ci_al_d1:                              ! align dest to half word
5346         lduba   [%o0]%asi, %o4          ! move a byte
5347         add     %o0, 1, %o0
5348         stb     %o4, [%o1]
5349         add     %o1, 1, %o1
5350         andcc   %o1, 2, %o3             ! is dest word aligned
5351         bz,pt   %ncc, .ci_al_d2f
5352         sub     %o2, 1, %o2
5353 .ci_al_d2:                              ! align dest to word
5354         lduba   [%o0]%asi, %o4          ! move a half-word (src align unknown)
5355         lduba   [%o0+1]%asi, %o3
5356         sll     %o4, 8, %o4             ! position
5357         or      %o4, %o3, %o4           ! merge
5358         sth     %o4, [%o1]
5359         add     %o0, 2, %o0
5360         add     %o1, 2, %o1
5361         andcc   %o1, 4, %o3             ! is dest longword aligned?
5362         bz,pt   %ncc, .ci_al_src
5363         sub     %o2, 2, %o2
5364         ba      .ci_al_d4
5365         nop
5366 /*
5367  * Handle all cases where src and dest are aligned on word
5368  * boundaries. Use unrolled loops for better performance.
5369  * This option wins over the standard large data move when
5370  * the source and destination are in cache for medium
5371  * to short data moves.
5372  */
5373 .ci_medword:
5374         subcc   %o2, 31, %o2            ! adjust length to allow cc test
5375         ble,pt  %ncc, .ci_medw31
5376         nop
5377 .ci_medw32:
5378         lda     [%o0]%asi, %o4          ! move a block of 32 bytes
5379         stw     %o4, [%o1]
5380         lda     [%o0+4]%asi, %o4
5381         stw     %o4, [%o1+4]
5382         lda     [%o0+8]%asi, %o4
5383         stw     %o4, [%o1+8]
5384         lda     [%o0+12]%asi, %o4
5385         stw     %o4, [%o1+12]
5386         lda     [%o0+16]%asi, %o4
5387         stw     %o4, [%o1+16]
5388         lda     [%o0+20]%asi, %o4
5389         subcc   %o2, 32, %o2            ! decrement length count
5390         stw     %o4, [%o1+20]
5391         lda     [%o0+24]%asi, %o4
5392         add     %o0, 32, %o0            ! increase src ptr by 32
5393         stw     %o4, [%o1+24]
5394         lda     [%o0-4]%asi, %o4
5395         add     %o1, 32, %o1            ! increase dst ptr by 32
5396         bgu,pt  %ncc, .ci_medw32        ! repeat if at least 32 bytes left
5397         stw     %o4, [%o1-4]
5398 .ci_medw31:
5399         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5400         ble,pt  %ncc, .ci_medw7         ! skip if 7 or fewer bytes left
5401         nop                             !
5402 .ci_medw15:
5403         lda     [%o0]%asi, %o4          ! move a block of 8 bytes
5404         subcc   %o2, 8, %o2             ! decrement length count
5405         stw     %o4, [%o1]
5406         add     %o0, 8, %o0             ! increase src ptr by 8
5407         lda     [%o0-4]%asi, %o4
5408         add     %o1, 8, %o1             ! increase dst ptr by 8
5409         bgu,pt  %ncc, .ci_medw15
5410         stw     %o4, [%o1-4]
5411 .ci_medw7:
5412         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
5413         bz,pt   %ncc, .ci_smallx        ! exit if finished
5414         cmp     %o2, 4
5415         blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
5416         nop                             !
5417         lda     [%o0]%asi, %o4          ! move 4 bytes
5418         add     %o0, 4, %o0             ! increase src ptr by 4
5419         add     %o1, 4, %o1             ! increase dst ptr by 4
5420         subcc   %o2, 4, %o2             ! decrease count by 4
5421         bnz     .ci_small3x
5422         stw     %o4, [%o1-4]
5423         membar  #Sync
5424         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5425         retl
5426         mov     %g0, %o0
5427 
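         ! .ci_medhalf below builds each aligned 8 byte store from a
         ! halfword, a word, and a halfword; roughly, in C (illustrative
         ! only; h0/w2/h6 name the loads by their source offsets):
         !
         !      x = ((uint64_t)h0 << 48) | ((uint64_t)w2 << 16) | h6;
         !      *(uint64_t *)dst = x;
         !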
5428 .ci_medhalf:
5429         subcc   %o2, 31, %o2            ! adjust length to allow cc test
5430         ble,pt  %ncc, .ci_medh31
5431         nop
5432 .ci_medh32:                             ! load and store block of 32 bytes
5433         subcc   %o2, 32, %o2            ! decrement length count
5434 
5435         lduha   [%o0]%asi, %o4          ! move 32 bytes
5436         lduwa   [%o0+2]%asi, %o3
5437         sllx    %o4, 48, %o4
5438         sllx    %o3, 16, %o3
5439         or      %o4, %o3, %o3
5440         lduha   [%o0+6]%asi, %o4
5441         or      %o4, %o3, %o4
5442         stx     %o4, [%o1]
5443 
5444         lduha   [%o0+8]%asi, %o4
5445         lduwa   [%o0+10]%asi, %o3
5446         sllx    %o4, 48, %o4
5447         sllx    %o3, 16, %o3
5448         or      %o4, %o3, %o3
5449         lduha   [%o0+14]%asi, %o4
5450         or      %o4, %o3, %o4
5451         stx     %o4, [%o1+8]
5452 
5453         lduha   [%o0+16]%asi, %o4
5454         lduwa   [%o0+18]%asi, %o3
5455         sllx    %o4, 48, %o4
5456         sllx    %o3, 16, %o3
5457         or      %o4, %o3, %o3
5458         lduha   [%o0+22]%asi, %o4
5459         or      %o4, %o3, %o4
5460         stx     %o4, [%o1+16]
5461 
5462         add     %o0, 32, %o0            ! increase src ptr by 32
5463         add     %o1, 32, %o1            ! increase dst ptr by 32
5464 
5465         lduha   [%o0-8]%asi, %o4
5466         lduwa   [%o0-6]%asi, %o3
5467         sllx    %o4, 48, %o4
5468         sllx    %o3, 16, %o3
5469         or      %o4, %o3, %o3
5470         lduha   [%o0-2]%asi, %o4
5471         or      %o3, %o4, %o4
5472         bgu,pt  %ncc, .ci_medh32        ! repeat if at least 32 bytes left
5473         stx     %o4, [%o1-8]
5474 
5475 .ci_medh31:
5476         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5477         ble,pt  %ncc, .ci_medh7         ! skip if 7 or fewer bytes left
5478         nop                             !
5479 .ci_medh15:
5480         lduha   [%o0]%asi, %o4          ! move 8 bytes
5481         subcc   %o2, 8, %o2             ! decrement length count
5482         lduwa   [%o0+2]%asi, %o3
5483         sllx    %o4, 48, %o4
5484         sllx    %o3, 16, %o3
5485         or      %o4, %o3, %o3
5486         add     %o1, 8, %o1             ! increase dst ptr by 8
5487         lduha   [%o0+6]%asi, %o4
5488         add     %o0, 8, %o0             ! increase src ptr by 8
5489         or      %o4, %o3, %o4
5490         bgu,pt  %ncc, .ci_medh15
5491         stx     %o4, [%o1-8]
5492 .ci_medh7:
5493         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
5494         bz,pt   %ncc, .ci_smallx        ! exit if finished
5495         cmp     %o2, 4
5496         blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
5497         nop                             !
5498         lduha   [%o0]%asi, %o4
5499         sll     %o4, 16, %o4
5500         lduha   [%o0+2]%asi, %o3
5501         or      %o3, %o4, %o4
5502         subcc   %o2, 4, %o2
5503         add     %o0, 4, %o0
5504         add     %o1, 4, %o1
5505         bnz     .ci_small3x
5506         stw     %o4, [%o1-4]
5507         membar  #Sync
5508         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5509         retl
5510         mov     %g0, %o0
5511 
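         ! .ci_med_byte below handles the odd source alignments. For
         ! alignment 1 or 5, each aligned 8 byte store is built as,
         ! roughly, in C (illustrative only; b0/h1/w3/b7 name the loads
         ! by their source offsets):
         !
         !      x = ((uint64_t)b0 << 56) | ((uint64_t)h1 << 40) |
         !          ((uint64_t)w3 << 8) | b7;
         !
         ! The alignment 3 or 7 variant (.ci_medbh32) instead loads
         ! byte, word, halfword, byte.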
5512         .align 16
5513 .ci_med_byte:
5514         bnz,pt  %ncc, .ci_medbh32a      ! go to correct byte move
5515         subcc   %o2, 31, %o2            ! adjust length to allow cc test
5516         ble,pt  %ncc, .ci_medb31
5517         nop
5518 .ci_medb32:                             ! Alignment 1 or 5
5519         subcc   %o2, 32, %o2            ! decrement length count
5520 
5521         lduba   [%o0]%asi, %o4          ! load and store a block of 32 bytes
5522         sllx    %o4, 56, %o3
5523         lduha   [%o0+1]%asi, %o4
5524         sllx    %o4, 40, %o4
5525         or      %o4, %o3, %o3
5526         lduwa   [%o0+3]%asi, %o4
5527         sllx    %o4, 8, %o4
5528         or      %o4, %o3, %o3
5529         lduba   [%o0+7]%asi, %o4
5530         or      %o4, %o3, %o4
5531         stx     %o4, [%o1]
5532 
5533         lduba   [%o0+8]%asi, %o4
5534         sllx    %o4, 56, %o3
5535         lduha   [%o0+9]%asi, %o4
5536         sllx    %o4, 40, %o4
5537         or      %o4, %o3, %o3
5538         lduwa   [%o0+11]%asi, %o4
5539         sllx    %o4, 8, %o4
5540         or      %o4, %o3, %o3
5541         lduba   [%o0+15]%asi, %o4
5542         or      %o4, %o3, %o4
5543         stx     %o4, [%o1+8]
5544 
5545         lduba   [%o0+16]%asi, %o4
5546         sllx    %o4, 56, %o3
5547         lduha   [%o0+17]%asi, %o4
5548         sllx    %o4, 40, %o4
5549         or      %o4, %o3, %o3
5550         lduwa   [%o0+19]%asi, %o4
5551         sllx    %o4, 8, %o4
5552         or      %o4, %o3, %o3
5553         lduba   [%o0+23]%asi, %o4
5554         or      %o4, %o3, %o4
5555         stx     %o4, [%o1+16]
5556 
5557         add     %o0, 32, %o0            ! increase src ptr by 32
5558         add     %o1, 32, %o1            ! increase dst ptr by 32
5559 
5560         lduba   [%o0-8]%asi, %o4
5561         sllx    %o4, 56, %o3
5562         lduha   [%o0-7]%asi, %o4
5563         sllx    %o4, 40, %o4
5564         or      %o4, %o3, %o3
5565         lduwa   [%o0-5]%asi, %o4
5566         sllx    %o4, 8, %o4
5567         or      %o4, %o3, %o3
5568         lduba   [%o0-1]%asi, %o4
5569         or      %o4, %o3, %o4
5570         bgu,pt  %ncc, .ci_medb32        ! repeat if at least 32 bytes left
5571         stx     %o4, [%o1-8]
5572 
5573 .ci_medb31:                             ! 31 or fewer bytes remaining
5574         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5575         ble,pt  %ncc, .ci_medb7         ! skip if 7 or fewer bytes left
5576         nop                             !
5577 .ci_medb15:
5578 
5579         lduba   [%o0]%asi, %o4          ! load and store a block of 8 bytes
5580         subcc   %o2, 8, %o2             ! decrement length count
5581         sllx    %o4, 56, %o3
5582         lduha   [%o0+1]%asi, %o4
5583         sllx    %o4, 40, %o4
5584         or      %o4, %o3, %o3
5585         lduwa   [%o0+3]%asi, %o4
5586         add     %o1, 8, %o1             ! increase dst ptr by 8
5587         sllx    %o4, 8, %o4
5588         or      %o4, %o3, %o3
5589         lduba   [%o0+7]%asi, %o4
5590         add     %o0, 8, %o0             ! increase src ptr by 8
5591         or      %o4, %o3, %o4
5592         bgu,pt  %ncc, .ci_medb15
5593         stx     %o4, [%o1-8]
5594 .ci_medb7:
5595         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
5596         bz,pt   %ncc, .ci_smallx        ! exit if finished
5597         cmp     %o2, 4
5598         blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
5599         nop                             !
5600         lduba   [%o0]%asi, %o4          ! move 4 bytes
5601         sll     %o4, 24, %o3
5602         lduha   [%o0+1]%asi, %o4
5603         sll     %o4, 8, %o4
5604         or      %o4, %o3, %o3
5605         lduba   [%o0+3]%asi, %o4
5606         or      %o4, %o3, %o4
5607         subcc   %o2, 4, %o2
5608         add     %o0, 4, %o0
5609         add     %o1, 4, %o1
5610         bnz     .ci_small3x
5611         stw     %o4, [%o1-4]
5612         membar  #Sync
5613         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5614         retl
5615         mov     %g0, %o0
5616 
5617         .align 16
5618 .ci_medbh32a:                           ! Alignment 3 or 7
5619         ble,pt  %ncc, .ci_medbh31
5620         nop
5621 .ci_medbh32:                            ! Alignment 3 or 7
5622         subcc   %o2, 32, %o2            ! decrement length count
5623 
5624         lduba   [%o0]%asi, %o4          ! load and store a block of 32 bytes
5625         sllx    %o4, 56, %o3
5626         lduwa   [%o0+1]%asi, %o4
5627         sllx    %o4, 24, %o4
5628         or      %o4, %o3, %o3
5629         lduha   [%o0+5]%asi, %o4
5630         sllx    %o4, 8, %o4
5631         or      %o4, %o3, %o3
5632         lduba   [%o0+7]%asi, %o4
5633         or      %o4, %o3, %o4
5634         stx     %o4, [%o1]
5635 
5636         lduba   [%o0+8]%asi, %o4
5637         sllx    %o4, 56, %o3
5638         lduwa   [%o0+9]%asi, %o4
5639         sllx    %o4, 24, %o4
5640         or      %o4, %o3, %o3
5641         lduha   [%o0+13]%asi, %o4
5642         sllx    %o4, 8, %o4
5643         or      %o4, %o3, %o3
5644         lduba   [%o0+15]%asi, %o4
5645         or      %o4, %o3, %o4
5646         stx     %o4, [%o1+8]
5647 
5648         lduba   [%o0+16]%asi, %o4
5649         sllx    %o4, 56, %o3
5650         lduwa   [%o0+17]%asi, %o4
5651         sllx    %o4, 24, %o4
5652         or      %o4, %o3, %o3
5653         lduha   [%o0+21]%asi, %o4
5654         sllx    %o4, 8, %o4
5655         or      %o4, %o3, %o3
5656         lduba   [%o0+23]%asi, %o4
5657         or      %o4, %o3, %o4
5658         stx     %o4, [%o1+16]
5659 
5660         add     %o0, 32, %o0            ! increase src ptr by 32
5661         add     %o1, 32, %o1            ! increase dst ptr by 32
5662 
5663         lduba   [%o0-8]%asi, %o4
5664         sllx    %o4, 56, %o3
5665         lduwa   [%o0-7]%asi, %o4
5666         sllx    %o4, 24, %o4
5667         or      %o4, %o3, %o3
5668         lduha   [%o0-3]%asi, %o4
5669         sllx    %o4, 8, %o4
5670         or      %o4, %o3, %o3
5671         lduba   [%o0-1]%asi, %o4
5672         or      %o4, %o3, %o4
5673         bgu,pt  %ncc, .ci_medbh32       ! repeat if at least 32 bytes left
5674         stx     %o4, [%o1-8]
5675 
5676 .ci_medbh31:
5677         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5678         ble,pt  %ncc, .ci_medb7         ! skip if 7 or fewer bytes left
5679         nop                             !
5680 .ci_medbh15:
5681         lduba   [%o0]%asi, %o4          ! load and store a block of 8 bytes
5682         sllx    %o4, 56, %o3
5683         lduwa   [%o0+1]%asi, %o4
5684         sllx    %o4, 24, %o4
5685         or      %o4, %o3, %o3
5686         lduha   [%o0+5]%asi, %o4
5687         sllx    %o4, 8, %o4
5688         or      %o4, %o3, %o3
5689         lduba   [%o0+7]%asi, %o4
5690         or      %o4, %o3, %o4
5691         stx     %o4, [%o1]
5692         subcc   %o2, 8, %o2             ! decrement length count
5693         add     %o1, 8, %o1             ! increase dst ptr by 8
5694         add     %o0, 8, %o0             ! increase src ptr by 8
5695         bgu,pt  %ncc, .ci_medbh15
5696         stx     %o4, [%o1-8]
5697         ba      .ci_medb7
5698         nop
5699 
5700 /*
5701  * End of small copy in code (no window)
5702  * 
5703  */
5704 
5705 /*
5706  * Long copy in code (using register window and fp regs)
5707  * 
5708  */
5709 
5710 .ci_copy_more:
5711         sethi   %hi(copyio_fault), %o3
5712         or      %o3, %lo(copyio_fault), %o3
5713         membar  #Sync
5714         stn     %o3, [THREAD_REG + T_LOFAULT]
5715 /*
5716  * Following code is for large copies. We know there is at
5717  * least FP_COPY bytes available. FP regs are used, so
5718  *  we save registers and fp regs before starting
5719  */
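     /*
      * The %fprs handling, roughly, in C (an illustrative sketch only):
      *
      *     if (fprs & FPRS_FEF)
      *             save fp regs to the stack;  ! fp in use, preserve it
      *     else
      *             fprs = FPRS_FEF;            ! enable fp, nothing to save
      */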
5720         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
5721         or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
5722         rd      %fprs, %g1              ! check for unused fp
5723         ! if fprs.fef == 0, set it.
5724         ! Setting it when already set costs more than checking
5725         andcc   %g1, FPRS_FEF, %g1      ! test FEF, fprs.du = fprs.dl = 0
5726         bz,pt   %ncc, .ci_fp_unused
5727         mov     ASI_USER, %asi
5728         BST_FP_TOSTACK(%o3)
5729         ba      .ci_fp_ready
5730 .ci_fp_unused:
5731         prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
5732         wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
5733 .ci_fp_ready:
5734         rd      %gsr, %l5               ! save %gsr value
5735         andcc   %i1, 1, %o3             ! is dest byte aligned
5736         bnz,pt  %ncc, .ci_big_d1
5737 .ci_big_d1f:                            ! dest is now half word aligned
5738         andcc   %i1, 2, %o3
5739         bnz,pt  %ncc, .ci_big_d2
5740 .ci_big_d2f:                            ! dest is now word aligned
5741         andcc   %i1, 4, %o3
5742         bnz,pt  %ncc, .ci_big_d4
5743 .ci_big_d4f:                            ! dest is long word aligned
5744         andcc   %i0, 7, %o3             ! is src long word aligned
5745         brnz,pt %o3, .ci_big_unal8
5746         prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
5747         ! Src and dst are long word aligned
5748         ! align dst to 64 byte boundary
5749         andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
5750         brz,pn  %o3, .ci_al_to_64
5751         nop
5752         sub     %o3, 64, %o3            ! %o3 has negative bytes to move
5753         add     %i2, %o3, %i2           ! adjust remaining count
5754         andcc   %o3, 8, %o4             ! odd long words to move?
5755         brz,pt  %o4, .ci_al_to_16
5756         nop
5757         add     %o3, 8, %o3
5758         ldxa    [%i0]%asi, %o4
5759         add     %i0, 8, %i0             ! increment src ptr
5760         add     %i1, 8, %i1             ! increment dst ptr
5761         stx     %o4, [%i1-8]
5762 ! Dest is aligned on 16 bytes, src 8 byte aligned
5763 .ci_al_to_16:
5764         andcc   %o3, 0x30, %o4          ! pair of long words to move?
5765         brz,pt  %o4, .ci_al_to_64
5766         nop
5767 .ci_al_mv_16:
5768         add     %o3, 16, %o3
5769         ldxa    [%i0]%asi, %o4
5770         stx     %o4, [%i1]
5771         add     %i0, 16, %i0            ! increment src ptr
5772         ldxa    [%i0-8]%asi, %o4
5773         stx     %o4, [%i1+8]
5774         andcc   %o3, 0x30, %o4
5775         brnz,pt %o4, .ci_al_mv_16
5776         add     %i1, 16, %i1            ! increment dst ptr
5777 ! Dest is aligned on 64 bytes, src 8 byte aligned
5778 .ci_al_to_64:
5779         ! Determine source alignment
5780         ! to correct 8 byte offset
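        ! A sketch of the dispatch below: bits <5:3> of the src address
        ! give k = (src >> 3) & 7, and entry point .ci_aln_<k in binary>
        ! first copies (64 - 8*k) % 64 bytes so that the block loads
        ! start 64 byte aligned; the loops then rotate the data through
        ! %d0-%d14 ahead of each block store.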
5781         andcc   %i0, 32, %o3
5782         brnz,pn %o3, .ci_aln_1
5783         andcc   %i0, 16, %o3
5784         brnz,pn %o3, .ci_aln_01
5785         andcc   %i0, 8, %o3
5786         brz,pn  %o3, .ci_aln_000
5787         prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5788         ba      .ci_aln_001
5789         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5790 .ci_aln_01:
5791         brnz,pn %o3, .ci_aln_011
5792         prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5793         ba      .ci_aln_010
5794         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5795 .ci_aln_1:
5796         andcc   %i0, 16, %o3
5797         brnz,pn %o3, .ci_aln_11
5798         andcc   %i0, 8, %o3
5799         brnz,pn %o3, .ci_aln_101
5800         prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5801         ba      .ci_aln_100
5802         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5803 .ci_aln_11:
5804         brz,pn  %o3, .ci_aln_110
5805         prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5806 
5807 .ci_aln_111:
5808 ! Alignment off by 8 bytes
5809         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5810         ldda    [%i0]%asi, %d0
5811         add     %i0, 8, %i0
5812         sub     %i2, 8, %i2
5813         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5814         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5815         sub     %i1, %i0, %i1
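        ! %i1 now holds (dst - src); the stores below use [%i0 + %i1],
        ! so only %i0 has to be advanced (%i1 is restored after the loop)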
5816 .ci_aln_111_loop:
5817         ldda    [%i0]ASI_BLK_AIUS,%d16          ! block load
5818         subcc   %o3, 64, %o3
5819         fmovd   %d16, %d2
5820         fmovd   %d18, %d4
5821         fmovd   %d20, %d6
5822         fmovd   %d22, %d8
5823         fmovd   %d24, %d10
5824         fmovd   %d26, %d12
5825         fmovd   %d28, %d14
5826         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5827         stda    %d0,[%i0+%i1]ASI_BLK_P
5828         add     %i0, 64, %i0
5829         fmovd   %d30, %d0
5830         bgt,pt  %ncc, .ci_aln_111_loop
5831         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5832         add     %i1, %i0, %i1
5833 
5834         std     %d0, [%i1]
5835         ba      .ci_remain_stuff
5836         add     %i1, 8, %i1
5837         ! END OF aln_111
5838 
5839 .ci_aln_110:
5840 ! Alignment off by 16 bytes
5841         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5842         ldda    [%i0]%asi, %d0
5843         ldda    [%i0+8]%asi, %d2
5844         add     %i0, 16, %i0
5845         sub     %i2, 16, %i2
5846         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5847         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5848         sub     %i1, %i0, %i1
5849 .ci_aln_110_loop:
5850         ldda    [%i0]ASI_BLK_AIUS,%d16          ! block load
5851         subcc   %o3, 64, %o3
5852         fmovd   %d16, %d4
5853         fmovd   %d18, %d6
5854         fmovd   %d20, %d8
5855         fmovd   %d22, %d10
5856         fmovd   %d24, %d12
5857         fmovd   %d26, %d14
5858         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5859         stda    %d0,[%i0+%i1]ASI_BLK_P
5860         add     %i0, 64, %i0
5861         fmovd   %d28, %d0
5862         fmovd   %d30, %d2
5863         bgt,pt  %ncc, .ci_aln_110_loop
5864         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5865         add     %i1, %i0, %i1
5866 
5867         std     %d0, [%i1]
5868         std     %d2, [%i1+8]
5869         ba      .ci_remain_stuff
5870         add     %i1, 16, %i1
5871         ! END OF aln_110
5872 
5873 .ci_aln_101:
5874 ! Alignment off by 24 bytes
5875         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5876         ldda    [%i0]%asi, %d0
5877         ldda    [%i0+8]%asi, %d2
5878         ldda    [%i0+16]%asi, %d4
5879         add     %i0, 24, %i0
5880         sub     %i2, 24, %i2
5881         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5882         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5883         sub     %i1, %i0, %i1
5884 .ci_aln_101_loop:
5885         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
5886         subcc   %o3, 64, %o3
5887         fmovd   %d16, %d6
5888         fmovd   %d18, %d8
5889         fmovd   %d20, %d10
5890         fmovd   %d22, %d12
5891         fmovd   %d24, %d14
5892         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5893         stda    %d0,[%i0+%i1]ASI_BLK_P
5894         add     %i0, 64, %i0
5895         fmovd   %d26, %d0
5896         fmovd   %d28, %d2
5897         fmovd   %d30, %d4
5898         bgt,pt  %ncc, .ci_aln_101_loop
5899         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5900         add     %i1, %i0, %i1
5901 
5902         std     %d0, [%i1]
5903         std     %d2, [%i1+8]
5904         std     %d4, [%i1+16]
5905         ba      .ci_remain_stuff
5906         add     %i1, 24, %i1
5907         ! END OF aln_101
5908 
5909 .ci_aln_100:
5910 ! Alignment off by 32 bytes
5911         ldda    [%i0]%asi, %d0
5912         ldda    [%i0+8]%asi, %d2
5913         ldda    [%i0+16]%asi,%d4
5914         ldda    [%i0+24]%asi,%d6
5915         add     %i0, 32, %i0
5916         sub     %i2, 32, %i2
5917         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5918         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5919         sub     %i1, %i0, %i1
5920 .ci_aln_100_loop:
5921         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
5922         subcc   %o3, 64, %o3
5923         fmovd   %d16, %d8
5924         fmovd   %d18, %d10
5925         fmovd   %d20, %d12
5926         fmovd   %d22, %d14
5927         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5928         stda    %d0,[%i0+%i1]ASI_BLK_P
5929         add     %i0, 64, %i0
5930         fmovd   %d24, %d0
5931         fmovd   %d26, %d2
5932         fmovd   %d28, %d4
5933         fmovd   %d30, %d6
5934         bgt,pt  %ncc, .ci_aln_100_loop
5935         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5936         add     %i1, %i0, %i1
5937 
5938         std     %d0, [%i1]
5939         std     %d2, [%i1+8]
5940         std     %d4, [%i1+16]
5941         std     %d6, [%i1+24]
5942         ba      .ci_remain_stuff
5943         add     %i1, 32, %i1
5944         ! END OF aln_100
5945 
5946 .ci_aln_011:
5947 ! Alignment off by 40 bytes
5948         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5949         ldda    [%i0]%asi, %d0
5950         ldda    [%i0+8]%asi, %d2
5951         ldda    [%i0+16]%asi, %d4
5952         ldda    [%i0+24]%asi, %d6
5953         ldda    [%i0+32]%asi, %d8
5954         add     %i0, 40, %i0
5955         sub     %i2, 40, %i2
5956         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5957         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5958         sub     %i1, %i0, %i1
5959 .ci_aln_011_loop:
5960         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
5961         subcc   %o3, 64, %o3
5962         fmovd   %d16, %d10
5963         fmovd   %d18, %d12
5964         fmovd   %d20, %d14
5965         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5966         stda    %d0,[%i0+%i1]ASI_BLK_P
5967         add     %i0, 64, %i0
5968         fmovd   %d22, %d0
5969         fmovd   %d24, %d2
5970         fmovd   %d26, %d4
5971         fmovd   %d28, %d6
5972         fmovd   %d30, %d8
5973         bgt,pt  %ncc, .ci_aln_011_loop
5974         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5975         add     %i1, %i0, %i1
5976 
5977         std     %d0, [%i1]
5978         std     %d2, [%i1+8]
5979         std     %d4, [%i1+16]
5980         std     %d6, [%i1+24]
5981         std     %d8, [%i1+32]
5982         ba      .ci_remain_stuff
5983         add     %i1, 40, %i1
5984         ! END OF aln_011
5985 
5986 .ci_aln_010:
5987 ! Alignment off by 48 bytes
5988         ldda    [%i0]%asi, %d0
5989         ldda    [%i0+8]%asi, %d2
5990         ldda    [%i0+16]%asi, %d4
5991         ldda    [%i0+24]%asi, %d6
5992         ldda    [%i0+32]%asi, %d8
5993         ldda    [%i0+40]%asi, %d10
5994         add     %i0, 48, %i0
5995         sub     %i2, 48, %i2
5996         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5997         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5998         sub     %i1, %i0, %i1
5999 .ci_aln_010_loop:
6000         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
6001         subcc   %o3, 64, %o3
6002         fmovd   %d16, %d12
6003         fmovd   %d18, %d14
6004         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6005         stda    %d0,[%i0+%i1]ASI_BLK_P
6006         add     %i0, 64, %i0
6007         fmovd   %d20, %d0
6008         fmovd   %d22, %d2
6009         fmovd   %d24, %d4
6010         fmovd   %d26, %d6
6011         fmovd   %d28, %d8
6012         fmovd   %d30, %d10
6013         bgt,pt  %ncc, .ci_aln_010_loop
6014         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6015         add     %i1, %i0, %i1
6016 
6017         std     %d0, [%i1]
6018         std     %d2, [%i1+8]
6019         std     %d4, [%i1+16]
6020         std     %d6, [%i1+24]
6021         std     %d8, [%i1+32]
6022         std     %d10, [%i1+40]
6023         ba      .ci_remain_stuff
6024         add     %i1, 48, %i1
6025         ! END OF aln_010
6026 
6027 .ci_aln_001:
6028 ! Alignment off by 56 bytes
6029         ldda    [%i0]%asi, %d0
6030         ldda    [%i0+8]%asi, %d2
6031         ldda    [%i0+16]%asi, %d4
6032         ldda    [%i0+24]%asi, %d6
6033         ldda    [%i0+32]%asi, %d8
6034         ldda    [%i0+40]%asi, %d10
6035         ldda    [%i0+48]%asi, %d12
6036         add     %i0, 56, %i0
6037         sub     %i2, 56, %i2
6038         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
6039         and     %i2, 0x7f, %i2          ! residue bytes in %i2
6040         sub     %i1, %i0, %i1
6041 .ci_aln_001_loop:
6042         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
6043         subcc   %o3, 64, %o3
6044         fmovd   %d16, %d14
6045         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6046         stda    %d0,[%i0+%i1]ASI_BLK_P
6047         add     %i0, 64, %i0
6048         fmovd   %d18, %d0
6049         fmovd   %d20, %d2
6050         fmovd   %d22, %d4
6051         fmovd   %d24, %d6
6052         fmovd   %d26, %d8
6053         fmovd   %d28, %d10
6054         fmovd   %d30, %d12
6055         bgt,pt  %ncc, .ci_aln_001_loop
6056         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6057         add     %i1, %i0, %i1
6058 
6059         std     %d0, [%i1]
6060         std     %d2, [%i1+8]
6061         std     %d4, [%i1+16]
6062         std     %d6, [%i1+24]
6063         std     %d8, [%i1+32]
6064         std     %d10, [%i1+40]
6065         std     %d12, [%i1+48]
6066         ba      .ci_remain_stuff
6067         add     %i1, 56, %i1
6068         ! END OF aln_001
6069 
6070 .ci_aln_000:
6071         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6072         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
6073         and     %i2, 0x7f, %i2          ! residue bytes in %i2
6074         sub     %i1, %i0, %i1
6075 .ci_aln_000_loop:
6076         ldda    [%i0]ASI_BLK_AIUS,%d0
6077         subcc   %o3, 64, %o3
6078         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
6079         stda    %d0,[%i0+%i1]ASI_BLK_P
6080         add     %i0, 64, %i0
6081         bgt,pt  %ncc, .ci_aln_000_loop
6082         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6083         add     %i1, %i0, %i1
6084 
6085         ! END OF aln_000
6086 
6087 .ci_remain_stuff:
6088         subcc   %i2, 31, %i2            ! adjust length to allow cc test
6089         ble,pt  %ncc, .ci_aln_31
6090         nop
6091 .ci_aln_32:
6092         ldxa    [%i0]%asi, %o4          ! move 32 bytes
6093         subcc   %i2, 32, %i2            ! decrement length count by 32
6094         stx     %o4, [%i1]
6095         ldxa    [%i0+8]%asi, %o4
6096         stx     %o4, [%i1+8]
6097         ldxa    [%i0+16]%asi, %o4
6098         add     %i0, 32, %i0            ! increase src ptr by 32
6099         stx     %o4, [%i1+16]
6100         ldxa    [%i0-8]%asi, %o4
6101         add     %i1, 32, %i1            ! increase dst ptr by 32
6102         bgu,pt  %ncc, .ci_aln_32        ! repeat if at least 32 bytes left
6103         stx     %o4, [%i1-8]
6104 .ci_aln_31:
6105         addcc   %i2, 24, %i2            ! adjust count to be off by 7
6106         ble,pt  %ncc, .ci_aln_7         ! skip if 7 or fewer bytes left
6107         nop                             !
6108 .ci_aln_15:
6109         ldxa    [%i0]%asi, %o4          ! move 8 bytes
6110         add     %i0, 8, %i0             ! increase src ptr by 8
6111         subcc   %i2, 8, %i2             ! decrease count by 8
6112         add     %i1, 8, %i1             ! increase dst ptr by 8
6113         bgu,pt  %ncc, .ci_aln_15
6114         stx     %o4, [%i1-8]            !
6115 .ci_aln_7:
6116         addcc   %i2, 7, %i2             ! finish adjustment of remaining count
6117         bz,pt   %ncc, .ci_exit          ! exit if finished
6118         cmp     %i2, 4
6119         blt,pt  %ncc, .ci_unaln3x       ! skip if less than 4 bytes left
6120         nop                             !
6121         lda     [%i0]%asi, %o4          ! move 4 bytes
6122         add     %i0, 4, %i0             ! increase src ptr by 4
6123         add     %i1, 4, %i1             ! increase dst ptr by 4
6124         subcc   %i2, 4, %i2             ! decrease count by 4
6125         bnz     .ci_unaln3x
6126         stw     %o4, [%i1-4]
6127         ba      .ci_exit
6128         nop
6129 
6130         ! destination alignment code
6131 .ci_big_d1:
6132         lduba   [%i0]%asi, %o4          ! move a byte
6133         add     %i0, 1, %i0
6134         stb     %o4, [%i1]
6135         add     %i1, 1, %i1
6136         andcc   %i1, 2, %o3
6137         bz,pt   %ncc, .ci_big_d2f
6138         sub     %i2, 1, %i2
6139 .ci_big_d2:                             ! dest is now at least half word aligned
6140         lduba   [%i0]%asi, %o4          ! move a half-word (src align unknown)
6141         lduba   [%i0+1]%asi, %o3
6142         add     %i0, 2, %i0
6143         sll     %o4, 8, %o4             ! position
6144         or      %o4, %o3, %o4           ! merge
6145         sth     %o4, [%i1]
6146         add     %i1, 2, %i1
6147         andcc   %i1, 4, %o3
6148         bz,pt   %ncc, .ci_big_d4f
6149         sub     %i2, 2, %i2
6150 .ci_big_d4:                             ! dest is at least word aligned
6151         nop
6152         lduba   [%i0]%asi, %o4          ! move a word (src align unknown)
6153         lduba   [%i0+1]%asi, %o3
6154         sll     %o4, 24, %o4            ! position
6155         sll     %o3, 16, %o3            ! position
6156         or      %o4, %o3, %o3           ! merge
6157         lduba   [%i0+2]%asi, %o4
6158         sll     %o4, 8, %o4             ! position
6159         or      %o4, %o3, %o3           ! merge
6160         lduba   [%i0+3]%asi, %o4
6161         or      %o4, %o3, %o4           ! merge
6162         stw     %o4,[%i1]               ! store four bytes
6163         add     %i0, 4, %i0             ! adjust src by 4
6164         add     %i1, 4, %i1             ! adjust dest by 4
6165         ba      .ci_big_d4f
6166         sub     %i2, 4, %i2             ! adjust count by 4
6167 
6168 
6169         ! Dst is on 8 byte boundary; src is not;
6170 .ci_big_unal8:
6171         andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
6172         bz      %ncc, .ci_unalnsrc
6173         sub     %o3, 64, %o3            ! %o3 will be multiple of 8
6174         neg     %o3                     ! bytes until dest is 64 byte aligned
6175         sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
6176         ! Move bytes according to source alignment
6177         andcc   %i0, 0x1, %o4
6178         bnz     %ncc, .ci_unalnbyte     ! check for byte alignment
6179         nop
6180         andcc   %i0, 2, %o4             ! check for half word alignment
6181         bnz     %ncc, .ci_unalnhalf
6182         nop
6183         ! Src is word aligned, move bytes until dest 64 byte aligned
6184 .ci_unalnword:
6185         lda     [%i0]%asi, %o4          ! load 4 bytes
6186         stw     %o4, [%i1]              ! and store 4 bytes
6187         lda     [%i0+4]%asi, %o4        ! load 4 bytes
6188         add     %i0, 8, %i0             ! increase src ptr by 8
6189         stw     %o4, [%i1+4]            ! and store 4 bytes
6190         subcc   %o3, 8, %o3             ! decrease count by 8
6191         bnz     %ncc, .ci_unalnword
6192         add     %i1, 8, %i1             ! increase dst ptr by 8
6193         ba      .ci_unalnsrc
6194         nop
6195 
6196         ! Src is half-word aligned, move bytes until dest 64 byte aligned
6197 .ci_unalnhalf:
6198         lduha   [%i0]%asi, %o4          ! load 2 bytes
6199         sllx    %o4, 32, %i3            ! shift left
6200         lduwa   [%i0+2]%asi, %o4
6201         or      %o4, %i3, %i3
6202         sllx    %i3, 16, %i3
6203         lduha   [%i0+6]%asi, %o4
6204         or      %o4, %i3, %i3
6205         stx     %i3, [%i1]
6206         add     %i0, 8, %i0
6207         subcc   %o3, 8, %o3
6208         bnz     %ncc, .ci_unalnhalf
6209         add     %i1, 8, %i1
6210         ba      .ci_unalnsrc
6211         nop
6212 
6213         ! Src is Byte aligned, move bytes until dest 64 byte aligned
6214 .ci_unalnbyte:
6215         sub     %i1, %i0, %i1           ! share pointer advance
6216 .ci_unalnbyte_loop:
6217         lduba   [%i0]%asi, %o4
6218         sllx    %o4, 56, %i3
6219         lduha   [%i0+1]%asi, %o4
6220         sllx    %o4, 40, %o4
6221         or      %o4, %i3, %i3
6222         lduha   [%i0+3]%asi, %o4
6223         sllx    %o4, 24, %o4
6224         or      %o4, %i3, %i3
6225         lduha   [%i0+5]%asi, %o4
6226         sllx    %o4, 8, %o4
6227         or      %o4, %i3, %i3
6228         lduba   [%i0+7]%asi, %o4
6229         or      %o4, %i3, %i3
6230         stx     %i3, [%i1+%i0]
6231         subcc   %o3, 8, %o3
6232         bnz     %ncc, .ci_unalnbyte_loop
6233         add     %i0, 8, %i0
6234         add     %i1,%i0, %i1            ! restore pointer
6235 
6236         ! Destination is now block (64 byte aligned), src is not 8 byte aligned
6237 .ci_unalnsrc:
6238         andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
6239         and     %i2, 0x3f, %i2          ! residue bytes in %i2
        add     %i2, 64, %i2            ! ensure we don't load beyond
        sub     %i3, 64, %i3            ! end of source buffer
6242 
6243         andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
6244         prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
6245         alignaddr %i0, %g0, %g0         ! generate %gsr
6246         add     %i0, %i3, %i0           ! advance %i0 to after blocks
6247         !
6248         ! Determine source alignment to correct 8 byte offset
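        ! A sketch of the faligndata scheme below: %gsr (set up by the
        ! alignaddr above) holds src & 7; each loop iteration block
        ! loads 64 aligned bytes into %d16-%d30 and faligndata extracts
        ! the unaligned 64 byte span from the previous and current
        ! registers into %d48-%d62 for one aligned block store.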
6249         andcc   %i0, 0x20, %o3
6250         brnz,pn %o3, .ci_unaln_1
6251         andcc   %i0, 0x10, %o3
6252         brnz,pn %o3, .ci_unaln_01
6253         andcc   %i0, 0x08, %o3
6254         brz,a   %o3, .ci_unaln_000
6255         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6256         ba      .ci_unaln_001
6257         nop
6258 .ci_unaln_01:
6259         brnz,a  %o3, .ci_unaln_011
6260         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6261         ba      .ci_unaln_010
6262         nop
6263 .ci_unaln_1:
6264         brnz,pn %o3, .ci_unaln_11
6265         andcc   %i0, 0x08, %o3
6266         brnz,a  %o3, .ci_unaln_101
6267         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6268         ba      .ci_unaln_100
6269         nop
6270 .ci_unaln_11:
6271         brz,pn  %o3, .ci_unaln_110
6272         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6273 
6274 .ci_unaln_111:
6275         ldda    [%o4+56]%asi, %d14
6276 .ci_unaln_111_loop:
6277         add     %o4, 64, %o4
6278         ldda    [%o4]ASI_BLK_AIUS, %d16
6279         faligndata %d14, %d16, %d48
6280         faligndata %d16, %d18, %d50
6281         faligndata %d18, %d20, %d52
6282         faligndata %d20, %d22, %d54
6283         faligndata %d22, %d24, %d56
6284         faligndata %d24, %d26, %d58
6285         faligndata %d26, %d28, %d60
6286         faligndata %d28, %d30, %d62
6287         fmovd   %d30, %d14
6288         stda    %d48, [%i1]ASI_BLK_P
6289         subcc   %i3, 64, %i3
6290         add     %i1, 64, %i1
6291         bgu,pt  %ncc, .ci_unaln_111_loop
6292         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6293         ba      .ci_unaln_done
6294         nop
6295 
6296 .ci_unaln_110:
6297         ldda    [%o4+48]%asi, %d12
6298         ldda    [%o4+56]%asi, %d14
6299 .ci_unaln_110_loop:
6300         add     %o4, 64, %o4
6301         ldda    [%o4]ASI_BLK_AIUS, %d16
6302         faligndata %d12, %d14, %d48
6303         faligndata %d14, %d16, %d50
6304         faligndata %d16, %d18, %d52
6305         faligndata %d18, %d20, %d54
6306         faligndata %d20, %d22, %d56
6307         faligndata %d22, %d24, %d58
6308         faligndata %d24, %d26, %d60
6309         faligndata %d26, %d28, %d62
6310         fmovd   %d28, %d12
6311         fmovd   %d30, %d14
6312         stda    %d48, [%i1]ASI_BLK_P
6313         subcc   %i3, 64, %i3
6314         add     %i1, 64, %i1
6315         bgu,pt  %ncc, .ci_unaln_110_loop
6316         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6317         ba      .ci_unaln_done
6318         nop
6319 
6320 .ci_unaln_101:
6321         ldda    [%o4+40]%asi, %d10
6322         ldda    [%o4+48]%asi, %d12
6323         ldda    [%o4+56]%asi, %d14
6324 .ci_unaln_101_loop:
6325         add     %o4, 64, %o4
6326         ldda    [%o4]ASI_BLK_AIUS, %d16
6327         faligndata %d10, %d12, %d48
6328         faligndata %d12, %d14, %d50
6329         faligndata %d14, %d16, %d52
6330         faligndata %d16, %d18, %d54
6331         faligndata %d18, %d20, %d56
6332         faligndata %d20, %d22, %d58
6333         faligndata %d22, %d24, %d60
6334         faligndata %d24, %d26, %d62
6335         fmovd   %d26, %d10
6336         fmovd   %d28, %d12
6337         fmovd   %d30, %d14
6338         stda    %d48, [%i1]ASI_BLK_P
6339         subcc   %i3, 64, %i3
6340         add     %i1, 64, %i1
6341         bgu,pt  %ncc, .ci_unaln_101_loop
6342         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6343         ba      .ci_unaln_done
6344         nop
6345 
6346 .ci_unaln_100:
6347         ldda    [%o4+32]%asi, %d8
6348         ldda    [%o4+40]%asi, %d10
6349         ldda    [%o4+48]%asi, %d12
6350         ldda    [%o4+56]%asi, %d14
6351 .ci_unaln_100_loop:
6352         add     %o4, 64, %o4
6353         ldda    [%o4]ASI_BLK_AIUS, %d16
6354         faligndata %d8, %d10, %d48
6355         faligndata %d10, %d12, %d50
6356         faligndata %d12, %d14, %d52
6357         faligndata %d14, %d16, %d54
6358         faligndata %d16, %d18, %d56
6359         faligndata %d18, %d20, %d58
6360         faligndata %d20, %d22, %d60
6361         faligndata %d22, %d24, %d62
6362         fmovd   %d24, %d8
6363         fmovd   %d26, %d10
6364         fmovd   %d28, %d12
6365         fmovd   %d30, %d14
6366         stda    %d48, [%i1]ASI_BLK_P
6367         subcc   %i3, 64, %i3
6368         add     %i1, 64, %i1
6369         bgu,pt  %ncc, .ci_unaln_100_loop
6370         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6371         ba      .ci_unaln_done
6372         nop
6373 
6374 .ci_unaln_011:
6375         ldda    [%o4+24]%asi, %d6
6376         ldda    [%o4+32]%asi, %d8
6377         ldda    [%o4+40]%asi, %d10
6378         ldda    [%o4+48]%asi, %d12
6379         ldda    [%o4+56]%asi, %d14
6380 .ci_unaln_011_loop:
6381         add     %o4, 64, %o4
6382         ldda    [%o4]ASI_BLK_AIUS, %d16
6383         faligndata %d6, %d8, %d48
6384         faligndata %d8, %d10, %d50
6385         faligndata %d10, %d12, %d52
6386         faligndata %d12, %d14, %d54
6387         faligndata %d14, %d16, %d56
6388         faligndata %d16, %d18, %d58
6389         faligndata %d18, %d20, %d60
6390         faligndata %d20, %d22, %d62
6391         fmovd   %d22, %d6
6392         fmovd   %d24, %d8
6393         fmovd   %d26, %d10
6394         fmovd   %d28, %d12
6395         fmovd   %d30, %d14
6396         stda    %d48, [%i1]ASI_BLK_P
6397         subcc   %i3, 64, %i3
6398         add     %i1, 64, %i1
6399         bgu,pt  %ncc, .ci_unaln_011_loop
6400         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6401         ba      .ci_unaln_done
6402         nop
6403 
6404 .ci_unaln_010:
6405         ldda    [%o4+16]%asi, %d4
6406         ldda    [%o4+24]%asi, %d6
6407         ldda    [%o4+32]%asi, %d8
6408         ldda    [%o4+40]%asi, %d10
6409         ldda    [%o4+48]%asi, %d12
6410         ldda    [%o4+56]%asi, %d14
6411 .ci_unaln_010_loop:
6412         add     %o4, 64, %o4
6413         ldda    [%o4]ASI_BLK_AIUS, %d16
6414         faligndata %d4, %d6, %d48
6415         faligndata %d6, %d8, %d50
6416         faligndata %d8, %d10, %d52
6417         faligndata %d10, %d12, %d54
6418         faligndata %d12, %d14, %d56
6419         faligndata %d14, %d16, %d58
6420         faligndata %d16, %d18, %d60
6421         faligndata %d18, %d20, %d62
6422         fmovd   %d20, %d4
6423         fmovd   %d22, %d6
6424         fmovd   %d24, %d8
6425         fmovd   %d26, %d10
6426         fmovd   %d28, %d12
6427         fmovd   %d30, %d14
6428         stda    %d48, [%i1]ASI_BLK_P
6429         subcc   %i3, 64, %i3
6430         add     %i1, 64, %i1
6431         bgu,pt  %ncc, .ci_unaln_010_loop
6432         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6433         ba      .ci_unaln_done
6434         nop
6435 
6436 .ci_unaln_001:
6437         ldda    [%o4+8]%asi, %d2
6438         ldda    [%o4+16]%asi, %d4
6439         ldda    [%o4+24]%asi, %d6
6440         ldda    [%o4+32]%asi, %d8
6441         ldda    [%o4+40]%asi, %d10
6442         ldda    [%o4+48]%asi, %d12
6443         ldda    [%o4+56]%asi, %d14
6444 .ci_unaln_001_loop:
6445         add     %o4, 64, %o4
6446         ldda    [%o4]ASI_BLK_AIUS, %d16
6447         faligndata %d2, %d4, %d48
6448         faligndata %d4, %d6, %d50
6449         faligndata %d6, %d8, %d52
6450         faligndata %d8, %d10, %d54
6451         faligndata %d10, %d12, %d56
6452         faligndata %d12, %d14, %d58
6453         faligndata %d14, %d16, %d60
6454         faligndata %d16, %d18, %d62
6455         fmovd   %d18, %d2
6456         fmovd   %d20, %d4
6457         fmovd   %d22, %d6
6458         fmovd   %d24, %d8
6459         fmovd   %d26, %d10
6460         fmovd   %d28, %d12
6461         fmovd   %d30, %d14
6462         stda    %d48, [%i1]ASI_BLK_P
6463         subcc   %i3, 64, %i3
6464         add     %i1, 64, %i1
6465         bgu,pt  %ncc, .ci_unaln_001_loop
6466         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6467         ba      .ci_unaln_done
6468         nop
6469 
6470 .ci_unaln_000:
6471         ldda    [%o4]ASI_BLK_AIUS, %d0
6472 .ci_unaln_000_loop:
6473         add     %o4, 64, %o4
6474         ldda    [%o4]ASI_BLK_AIUS, %d16
6475         faligndata %d0, %d2, %d48
6476         faligndata %d2, %d4, %d50
6477         faligndata %d4, %d6, %d52
6478         faligndata %d6, %d8, %d54
6479         faligndata %d8, %d10, %d56
6480         faligndata %d10, %d12, %d58
6481         faligndata %d12, %d14, %d60
6482         faligndata %d14, %d16, %d62
6483         fmovd   %d16, %d0
6484         fmovd   %d18, %d2
6485         fmovd   %d20, %d4
6486         fmovd   %d22, %d6
6487         fmovd   %d24, %d8
6488         fmovd   %d26, %d10
6489         fmovd   %d28, %d12
6490         fmovd   %d30, %d14
6491         stda    %d48, [%i1]ASI_BLK_P
6492         subcc   %i3, 64, %i3
6493         add     %i1, 64, %i1
6494         bgu,pt  %ncc, .ci_unaln_000_loop
6495         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6496 
6497 .ci_unaln_done:
6498         ! Handle trailing bytes, 64 to 127
6499         ! Dest long word aligned, Src not long word aligned
6500         cmp     %i2, 15
6501         bleu    %ncc, .ci_unaln_short
6502 
6503         andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
6504         and     %i2, 0x7, %i2           ! residue bytes in %i2
6505         add     %i2, 8, %i2
        sub     %i3, 8, %i3             ! ensure we don't load past end of src
6507         andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
6508         add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
6509         ldda    [%o4]%asi, %d0          ! fetch partial word
6510 .ci_unaln_by8:
6511         ldda    [%o4+8]%asi, %d2
6512         add     %o4, 8, %o4
6513         faligndata %d0, %d2, %d16
6514         subcc   %i3, 8, %i3
6515         std     %d16, [%i1]
6516         fmovd   %d2, %d0
6517         bgu,pt  %ncc, .ci_unaln_by8
6518         add     %i1, 8, %i1
6519 
6520 .ci_unaln_short:
6521         cmp     %i2, 8
6522         blt,pt  %ncc, .ci_unalnfin
6523         nop
6524         lduba   [%i0]%asi, %o4
6525         sll     %o4, 24, %o3
6526         lduba   [%i0+1]%asi, %o4
6527         sll     %o4, 16, %o4
6528         or      %o4, %o3, %o3
6529         lduba   [%i0+2]%asi, %o4
6530         sll     %o4, 8, %o4
6531         or      %o4, %o3, %o3
6532         lduba   [%i0+3]%asi, %o4
6533         or      %o4, %o3, %o3
6534         stw     %o3, [%i1]
6535         lduba   [%i0+4]%asi, %o4
6536         sll     %o4, 24, %o3
6537         lduba   [%i0+5]%asi, %o4
6538         sll     %o4, 16, %o4
6539         or      %o4, %o3, %o3
6540         lduba   [%i0+6]%asi, %o4
6541         sll     %o4, 8, %o4
6542         or      %o4, %o3, %o3
6543         lduba   [%i0+7]%asi, %o4
6544         or      %o4, %o3, %o3
6545         stw     %o3, [%i1+4]
6546         add     %i0, 8, %i0
6547         add     %i1, 8, %i1
6548         sub     %i2, 8, %i2
6549 .ci_unalnfin:
6550         cmp     %i2, 4
6551         blt,pt  %ncc, .ci_unalnz
6552         tst     %i2
6553         lduba   [%i0]%asi, %o3          ! read byte
6554         subcc   %i2, 4, %i2             ! reduce count by 4
6555         sll     %o3, 24, %o3            ! position
6556         lduba   [%i0+1]%asi, %o4
6557         sll     %o4, 16, %o4            ! position
6558         or      %o4, %o3, %o3           ! merge
6559         lduba   [%i0+2]%asi, %o4
6560         sll     %o4, 8, %o4             ! position
6561         or      %o4, %o3, %o3           ! merge
6562         add     %i1, 4, %i1             ! advance dst by 4
6563         lduba   [%i0+3]%asi, %o4
6564         add     %i0, 4, %i0             ! advance src by 4
6565         or      %o4, %o3, %o4           ! merge
6566         bnz,pt  %ncc, .ci_unaln3x
6567         stw     %o4, [%i1-4]
6568         ba      .ci_exit
6569         nop
6570 .ci_unalnz:
6571         bz,pt   %ncc, .ci_exit
6572         wr      %l5, %g0, %gsr          ! restore %gsr
6573 .ci_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
6574         subcc   %i2, 1, %i2             ! reduce count for cc test
6575         lduba   [%i0]%asi, %o4          ! load one byte
6576         bz,pt   %ncc, .ci_exit
6577         stb     %o4, [%i1]              ! store one byte
6578         lduba   [%i0+1]%asi, %o4        ! load second byte
6579         subcc   %i2, 1, %i2
6580         bz,pt   %ncc, .ci_exit
6581         stb     %o4, [%i1+1]            ! store second byte
6582         lduba   [%i0+2]%asi, %o4        ! load third byte
6583         stb     %o4, [%i1+2]            ! store third byte
6584 .ci_exit:
6585         brnz    %g1, .ci_fp_restore
6586         nop
6587         FZERO
6588         wr      %g1, %g0, %fprs
6589         ba,pt   %ncc, .ci_ex2
6590         membar  #Sync
6591 .ci_fp_restore:
6592         BLD_FP_FROMSTACK(%o4)
6593 .ci_ex2:
6594         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
6595         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6596         ret
6597         restore %g0, 0, %o0
6598 
6599 .copyin_err:
6600         ldn     [THREAD_REG + T_COPYOPS], %o4
6601         brz     %o4, 2f
6602         nop
6603         ldn     [%o4 + CP_COPYIN], %g2
6604         jmp     %g2
6605         nop
6606 2:
6607         retl
6608         mov     -1, %o0
6609 
6610 #else   /* NIAGARA_IMPL */
6611 .do_copyin:
6612         !
6613         ! Check the length and bail if zero.
6614         !
6615         tst     %o2
6616         bnz,pt  %ncc, 1f
6617         nop
6618         retl
6619         clr     %o0
6620 1:
6621         sethi   %hi(copyio_fault), %o4
6622         or      %o4, %lo(copyio_fault), %o4
6623         sethi   %hi(copyio_fault_nowindow), %o3
6624         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
6625         or      %o3, %lo(copyio_fault_nowindow), %o3
6626         membar  #Sync
6627         stn     %o3, [THREAD_REG + T_LOFAULT]
6628 
6629         mov     %o0, SAVE_SRC
6630         mov     %o1, SAVE_DST
6631         mov     %o2, SAVE_COUNT
6632 
6633         !
        ! Check to see if we're copying more than SMALL_LIMIT bytes.
6635         !
6636         subcc   %o2, SMALL_LIMIT, %o3
6637         bgu,a,pt %ncc, .dci_ns
6638         or      %o0, %o1, %o3
6639         !
6640         ! What was previously ".small_copyin"
6641         !
6642 .dcibcp:
6643         sub     %g0, %o2, %o3           ! setup for copy loop
6644         add     %o0, %o2, %o0
6645         add     %o1, %o2, %o1
6646         ba,pt   %ncc, .dcicl
6647         lduba   [%o0 + %o3]ASI_USER, %o4
6648         !
6649         ! %o0 and %o1 point at the end and remain pointing at the end
6650         ! of their buffers. We pull things out by adding %o3 (which is
6651         ! the negation of the length) to the buffer end which gives us
        ! the current location in the buffers. By incrementing %o3 we walk
6653         ! through both buffers without having to bump each buffer's
6654         ! pointer. A very fast 4 instruction loop.
6655         !
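        !
        ! Roughly, in C-like pseudo-code (a sketch; the real loop is
        ! software pipelined through %o4):
        !
        !       off = -count; src += count; dst += count;
        !       do {
        !               dst[off] = src[off];    ! load via ASI_USER
        !       } while (++off < 0);
        !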
6656         .align 16
6657 .dcicl:
6658         stb     %o4, [%o1 + %o3]
6659         inccc   %o3
6660         bl,a,pt %ncc, .dcicl
6661         lduba   [%o0 + %o3]ASI_USER, %o4
6662         !
6663         ! We're done. Go home.
6664         !       
6665         membar  #Sync
6666         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
6667         retl
6668         clr     %o0
6669         !
6670         ! Try aligned copies from here.
6671         !
6672 .dci_ns:
6673         !
6674         ! See if we're single byte aligned. If we are, check the
6675         ! limit for single byte copies. If we're smaller, or equal,
6676         ! bounce to the byte for byte copy loop. Otherwise do it in
6677         ! HW (if enabled).
6678         !
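        !
        ! In pseudo-code (a sketch; the aligned cases below follow the
        ! same pattern with their own limits):
        !
        !       if (hw_copy_limit_1 == 0 || count <= hw_copy_limit_1)
        !               use the byte for byte loop;
        !       else
        !               goto .big_copyin;
        !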
6679         btst    1, %o3
6680         bz,a,pt %icc, .dcih8
6681         btst    7, %o3
6682         !
6683         ! We're single byte aligned.
6684         !
6685         sethi   %hi(hw_copy_limit_1), %o3
6686         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
6687         !
6688         ! Is HW copy on? If not do everything byte for byte.
6689         !
6690         tst     %o3
6691         bz,pn   %icc, .dcibcp
6692         subcc   %o3, %o2, %o3
6693         !
6694         ! Are we bigger than the HW limit? If not
6695         ! go to byte for byte.
6696         !
6697         bge,pt  %ncc, .dcibcp
6698         nop
6699         !
6700         ! We're big enough and copy is on. Do it with HW.
6701         !
6702         ba,pt   %ncc, .big_copyin
6703         nop
6704 .dcih8:
6705         !
6706         ! 8 byte aligned?
6707         !
6708         bnz,a   %ncc, .dcih4
6709         btst    3, %o3
6710         !
6711         ! We're eight byte aligned.
6712         !
6713         sethi   %hi(hw_copy_limit_8), %o3
6714         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
6715         !
6716         ! Is HW assist on? If not, do it with the aligned copy.
6717         !
6718         tst     %o3
6719         bz,pn   %icc, .dcis8
6720         subcc   %o3, %o2, %o3
6721         bge     %ncc, .dcis8
6722         nop
6723         ba,pt   %ncc, .big_copyin
6724         nop
6725 .dcis8:
6726         !
6727         ! Housekeeping for copy loops. Uses same idea as in the byte for
6728         ! byte copy loop above.
6729         !
6730         add     %o0, %o2, %o0
6731         add     %o1, %o2, %o1
6732         sub     %g0, %o2, %o3
6733         ba,pt   %ncc, .didebc
6734         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
6735         !
6736         ! 4 byte aligned?
6737         !
6738 .dcih4:
6739         bnz     %ncc, .dcih2
6740         sethi   %hi(hw_copy_limit_4), %o3
6741         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
6742         !
6743         ! Is HW assist on? If not, do it with the aligned copy.
6744         !
6745         tst     %o3
6746         bz,pn   %icc, .dcis4
6747         subcc   %o3, %o2, %o3
        !
        ! %o3 is negative only if our size is greater than hw_copy_limit_4,
        ! in which case we use the HW assisted copy; otherwise do the
        ! plain aligned copy.
        !
6751         bge     %ncc, .dcis4
6752         nop
6753         ba,pt   %ncc, .big_copyin
6754         nop
6755 .dcis4:
6756         !
6757         ! Housekeeping for copy loops. Uses same idea as in the byte
6758         ! for byte copy loop above.
6759         !
6760         add     %o0, %o2, %o0
6761         add     %o1, %o2, %o1
6762         sub     %g0, %o2, %o3
6763         ba,pt   %ncc, .didfbc
6764         srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
6765 .dcih2:
6766         !
6767         ! We're two byte aligned. Check for "smallness"
6768         ! done in delay at .dcih4
6769         !
6770         bleu,pt %ncc, .dcis2
6771         sethi   %hi(hw_copy_limit_2), %o3
6772         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
6773         !
6774         ! Is HW assist on? If not, do it with the aligned copy.
6775         !
6776         tst     %o3
6777         bz,pn   %icc, .dcis2
6778         subcc   %o3, %o2, %o3
6779         !
6780         ! Are we larger than the HW limit?
6781         !
6782         bge     %ncc, .dcis2
6783         nop
6784         !
6785         ! HW assist is on and we're large enough to use it.
6786         !
6787         ba,pt   %ncc, .big_copyin
6788         nop
6789         !
6790         ! Housekeeping for copy loops. Uses same idea as in the byte
6791         ! for byte copy loop above.
6792         !
6793 .dcis2:
6794         add     %o0, %o2, %o0
6795         add     %o1, %o2, %o1
6796         sub     %g0, %o2, %o3
6797         ba,pt   %ncc, .didtbc
6798         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
6799         !
6800 .small_copyin:
6801         !
6802         ! Why are we doing this AGAIN? There are certain conditions in
        ! big copyin that will cause us to forgo the HW assisted copies
6804         ! and bounce back to a non-hw assisted copy. This dispatches
6805         ! those copies. Note that we branch around this in the main line
6806         ! code.
6807         !
6808         ! We make no check for limits or HW enablement here. We've
6809         ! already been told that we're a poster child so just go off
6810         ! and do it.
6811         !
6812         or      %o0, %o1, %o3
6813         btst    1, %o3
6814         bnz     %icc, .dcibcp           ! Most likely
6815         btst    7, %o3
6816         bz      %icc, .dcis8
6817         btst    3, %o3
6818         bz      %icc, .dcis4
6819         nop
6820         ba,pt   %ncc, .dcis2
6821         nop
6822         !
6823         ! Eight byte aligned copies. A steal from the original .small_copyin
6824         ! with modifications. %o2 is number of 8 byte chunks to copy. When
6825         ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
6826         ! to copy.
6827         !
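        !
        ! A sketch of the loop below:
        !
        !       off = -count; n = count >> 3;
        !       do {
        !               *(uint64_t *)(dst + off) = load8(src + off);
        !               off += 8;
        !       } while (--n > 0);
        !       if (off != 0)
        !               finish the last 1 - 7 bytes in .dcicl;
        !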
6828         .align 32
6829 .didebc:
6830         ldxa    [%o0 + %o3]ASI_USER, %o4
6831         deccc   %o2
6832         stx     %o4, [%o1 + %o3]
6833         bg,pt   %ncc, .didebc
6834         addcc   %o3, 8, %o3
6835         !
6836         ! End of copy loop. Most 8 byte aligned copies end here.
6837         !
6838         bz,pt   %ncc, .dcifh
6839         nop
6840         !
6841         ! Something is left. Do it byte for byte.
6842         !
6843         ba,pt   %ncc, .dcicl
6844         lduba   [%o0 + %o3]ASI_USER, %o4
6845         !
6846         ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
6847         !
6848         .align 32
6849 .didfbc:
6850         lduwa   [%o0 + %o3]ASI_USER, %o4
6851         deccc   %o2
6852         st      %o4, [%o1 + %o3]
6853         bg,pt   %ncc, .didfbc
6854         addcc   %o3, 4, %o3
6855         !
6856         ! End of copy loop. Most 4 byte aligned copies end here.
6857         !
6858         bz,pt   %ncc, .dcifh
6859         nop
6860         !
6861         ! Something is left. Do it byte for byte.
6862         !
6863         ba,pt   %ncc, .dcicl
6864         lduba   [%o0 + %o3]ASI_USER, %o4
6865         !
6866         ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
6867         ! copy.
6868         !
6869         .align 32
6870 .didtbc:
6871         lduha   [%o0 + %o3]ASI_USER, %o4
6872         deccc   %o2
6873         sth     %o4, [%o1 + %o3]
6874         bg,pt   %ncc, .didtbc
6875         addcc   %o3, 2, %o3
6876         !
6877         ! End of copy loop. Most 2 byte aligned copies end here.
6878         !
6879         bz,pt   %ncc, .dcifh
6880         nop
6881         !
6882         ! Deal with the last byte
6883         !
6884         lduba   [%o0 + %o3]ASI_USER, %o4
6885         stb     %o4, [%o1 + %o3]
6886 .dcifh:
6887         membar  #Sync
6888         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6889         retl
6890         clr     %o0
6891 
6892 .big_copyin:
6893         ! We're going off to do a block copy.
        ! Switch fault handlers and grab a window. We
6895         ! don't do a membar #Sync since we've done only
6896         ! kernel data to this point.
6897         stn     %o4, [THREAD_REG + T_LOFAULT]
6898 
        ! Copyins that reach here are larger than 256 bytes. The
        ! hw_copy_limit_1 is set to 256. Never set this limit to less
        ! than 128 bytes.
6902         save    %sp, -SA(MINFRAME), %sp
6903 .do_blockcopyin:
6904 
6905         ! Swap src/dst since the code below is memcpy code
6906         ! and memcpy/bcopy have different calling sequences
6907         mov     %i1, %i5
6908         mov     %i0, %i1
6909         mov     %i5, %i0
6910 
6911         ! Block (64 bytes) align the destination.
6912         andcc   %i0, 0x3f, %i3          ! is dst block aligned
6913         bz      %ncc, copyin_blalign    ! dst already block aligned
6914         sub     %i3, 0x40, %i3
        neg     %i3                     ! bytes till dst is 64 byte aligned
6916         sub     %i2, %i3, %i2           ! update i2 with new count
6917 
6918         ! Based on source and destination alignment do
6919         ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
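        !
        ! A sketch of the dispatch (n = bytes till dst is 64 byte
        ! aligned, computed above):
        !
        !       if (((src | dst) & 0x7) == 0) copy n bytes by 8;
        !       else if (((src | dst) & 0x3) == 0) copy n bytes by 4;
        !       else if (((src | dst) & 0x1) == 0) copy n bytes by 2;
        !       else copy n bytes one at a time;
        !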
6920 
6921         ! Is dst & src 8B aligned
6922         or      %i0, %i1, %o2
6923         andcc   %o2, 0x7, %g0
6924         bz      %ncc, .ci_alewdcp
6925         nop
6926 
6927         ! Is dst & src 4B aligned
6928         andcc   %o2, 0x3, %g0
6929         bz      %ncc, .ci_alwdcp
6930         nop
6931 
6932         ! Is dst & src 2B aligned
6933         andcc   %o2, 0x1, %g0
6934         bz      %ncc, .ci_alhlfwdcp
6935         nop
6936 
6937         ! 1B aligned
6938 1:      lduba   [%i1]ASI_USER, %o2
6939         stb     %o2, [%i0]
6940         inc     %i1
6941         deccc   %i3
6942         bgu,pt  %ncc, 1b
6943         inc     %i0
6944 
6945         ba      copyin_blalign
6946         nop
6947 
6948         ! dst & src 4B aligned
6949 .ci_alwdcp:
6950         lda     [%i1]ASI_USER, %o2
6951         st      %o2, [%i0]
6952         add     %i1, 0x4, %i1
6953         subcc   %i3, 0x4, %i3
6954         bgu,pt  %ncc, .ci_alwdcp
6955         add     %i0, 0x4, %i0
6956 
6957         ba      copyin_blalign
6958         nop
6959 
6960         ! dst & src 2B aligned
6961 .ci_alhlfwdcp:
6962         lduha   [%i1]ASI_USER, %o2
6963         stuh    %o2, [%i0]
6964         add     %i1, 0x2, %i1
6965         subcc   %i3, 0x2, %i3
6966         bgu,pt  %ncc, .ci_alhlfwdcp
6967         add     %i0, 0x2, %i0
6968 
6969         ba      copyin_blalign
6970         nop
6971 
6972         ! dst & src 8B aligned
6973 .ci_alewdcp:
6974         ldxa    [%i1]ASI_USER, %o2
6975         stx     %o2, [%i0]
6976         add     %i1, 0x8, %i1
6977         subcc   %i3, 0x8, %i3
6978         bgu,pt  %ncc, .ci_alewdcp
6979         add     %i0, 0x8, %i0
6980 
6981 copyin_blalign:
6982         andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
6983         sub     %i2, %i3, %i2           ! Residue bytes in %i2
6984 
6985         mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
6986 
6987         andcc   %i1, 0xf, %o2           ! is src quadword aligned
6988         bz,pn   %xcc, .ci_blkcpy        ! src offset in %o2 (last 4-bits)
6989         nop
6990         cmp     %o2, 0x8
6991         bg      .ci_upper_double
6992         nop
6993         bl      .ci_lower_double
6994         nop
6995 
        ! Falls through when source offset is equal to 8, i.e.
        ! source is double word aligned.
        ! In this case no shift/merge of data is required.
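        ! (Each quad load returns an aligned register pair, so with a
        ! src offset of 8 the first useful double lands in the odd
        ! register; the stores below simply lag the loads by one
        ! register.)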
6999 
7000         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
7001         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
7002         prefetcha [%l0]ASI_USER, #one_read
7003         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7004         add     %l0, 0x40, %l0
7005 .ci_loop0:
7006         add     %i1, 0x10, %i1
7007         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7008 
7009         prefetcha [%l0]ASI_USER, #one_read
7010 
7011         stxa    %l3, [%i0+0x0]%asi
7012         stxa    %l4, [%i0+0x8]%asi
7013 
7014         add     %i1, 0x10, %i1
7015         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7016 
7017         stxa    %l5, [%i0+0x10]%asi
7018         stxa    %l2, [%i0+0x18]%asi
7019 
7020         add     %i1, 0x10, %i1
7021         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7022 
7023         stxa    %l3, [%i0+0x20]%asi
7024         stxa    %l4, [%i0+0x28]%asi
7025 
7026         add     %i1, 0x10, %i1
7027         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7028 
7029         stxa    %l5, [%i0+0x30]%asi
7030         stxa    %l2, [%i0+0x38]%asi
7031 
7032         add     %l0, 0x40, %l0
7033         subcc   %i3, 0x40, %i3
7034         bgu,pt  %xcc, .ci_loop0
7035         add     %i0, 0x40, %i0
7036         ba      .ci_blkdone
7037         add     %i1, %o2, %i1           ! increment the source by src offset
7038                                         ! the src offset was stored in %o2
7039 
7040 .ci_lower_double:
7041 
7042         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
7043         sll     %o2, 3, %o0             ! %o0 left shift
7044         mov     0x40, %o1
7045         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
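        ! Roughly, each ALIGN_DATA(in0, in1, in2, lshift, rshift, tmp)
        ! below merges three doubles into two:
        !       out0 = (in0 << lshift) | (in1 >> rshift)
        !       out1 = (in1 << lshift) | (in2 >> rshift)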
7046         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
7047         prefetcha [%l0]ASI_USER, #one_read
7048         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2    ! partial data in %l2
7049                                                         ! and %l3 has complete
7050                                                         ! data
7051         add     %l0, 0x40, %l0
7052 .ci_loop1:
7053         add     %i1, 0x10, %i1
7054         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4    ! %l4 has partial data
7055                                                         ! for this read.
7056         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
7057                                                         ! into %l2 and %l3
7058 
7059         prefetcha [%l0]ASI_USER, #one_read
7060 
7061         stxa    %l2, [%i0+0x0]%asi
7062         stxa    %l3, [%i0+0x8]%asi
7063 
7064         add     %i1, 0x10, %i1
7065         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7066         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
7067                                                         ! %l4 from previous read
7068                                                         ! into %l4 and %l5
7069         stxa    %l4, [%i0+0x10]%asi
7070         stxa    %l5, [%i0+0x18]%asi
7071 
7072         ! Repeat the same for next 32 bytes.
7073 
7074         add     %i1, 0x10, %i1
7075         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7076         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
7077 
7078         stxa    %l2, [%i0+0x20]%asi
7079         stxa    %l3, [%i0+0x28]%asi
7080 
7081         add     %i1, 0x10, %i1
7082         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7083         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
7084 
7085         stxa    %l4, [%i0+0x30]%asi
7086         stxa    %l5, [%i0+0x38]%asi
7087 
7088         add     %l0, 0x40, %l0
7089         subcc   %i3, 0x40, %i3
7090         bgu,pt  %xcc, .ci_loop1
7091         add     %i0, 0x40, %i0
7092         ba      .ci_blkdone
7093         add     %i1, %o2, %i1           ! increment the source by src offset
7094                                         ! the src offset was stored in %o2
7095 
7096 .ci_upper_double:
7097 
7098         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
7099         sub     %o2, 0x8, %o0
7100         sll     %o0, 3, %o0             ! %o0 left shift
7101         mov     0x40, %o1
7102         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
7103         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
7104         prefetcha [%l0]ASI_USER, #one_read
7105         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2    ! partial data in %l3
7106                                                         ! for this read and
7107                                                         ! no data in %l2
7108         add     %l0, 0x40, %l0
7109 .ci_loop2:
7110         add     %i1, 0x10, %i1
7111         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4    ! %l4 has complete data
7112                                                         ! and %l5 has partial
7113         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
7114                                                         ! into %l3 and %l4
7115         prefetcha [%l0]ASI_USER, #one_read
7116 
7117         stxa    %l3, [%i0+0x0]%asi
7118         stxa    %l4, [%i0+0x8]%asi
7119 
7120         add     %i1, 0x10, %i1
7121         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7122         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
7123                                                         ! %l5 from previous read
7124                                                         ! into %l5 and %l2
7125 
7126         stxa    %l5, [%i0+0x10]%asi
7127         stxa    %l2, [%i0+0x18]%asi
7128 
7129         ! Repeat the same for next 32 bytes.
7130 
7131         add     %i1, 0x10, %i1
7132         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7133         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
7134 
7135         stxa    %l3, [%i0+0x20]%asi
7136         stxa    %l4, [%i0+0x28]%asi
7137 
7138         add     %i1, 0x10, %i1
7139         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7140         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
7141 
7142         stxa    %l5, [%i0+0x30]%asi
7143         stxa    %l2, [%i0+0x38]%asi
7144 
7145         add     %l0, 0x40, %l0
7146         subcc   %i3, 0x40, %i3
7147         bgu,pt  %xcc, .ci_loop2
7148         add     %i0, 0x40, %i0
7149         ba      .ci_blkdone
7150         add     %i1, %o2, %i1           ! increment the source by src offset
7151                                         ! the src offset was stored in %o2
7152 
7153 
7154         ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
7155 .ci_blkcpy:
7156 
7157         andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
7158         prefetcha [%o0]ASI_USER, #one_read
7159         add     %o0, 0x40, %o0
7160 1:
7161         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
7162         add     %i1, 0x10, %i1
7163         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7164         add     %i1, 0x10, %i1
7165 
7166         prefetcha [%o0]ASI_USER, #one_read
7167 
7168         stxa    %l0, [%i0+0x0]%asi
7169 
7170         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7171         add     %i1, 0x10, %i1
7172         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
7173         add     %i1, 0x10, %i1
7174 
7175         stxa    %l1, [%i0+0x8]%asi
7176         stxa    %l2, [%i0+0x10]%asi
7177         stxa    %l3, [%i0+0x18]%asi
7178         stxa    %l4, [%i0+0x20]%asi
7179         stxa    %l5, [%i0+0x28]%asi
7180         stxa    %l6, [%i0+0x30]%asi
7181         stxa    %l7, [%i0+0x38]%asi
7182 
7183         add     %o0, 0x40, %o0
7184         subcc   %i3, 0x40, %i3
7185         bgu,pt  %xcc, 1b
7186         add     %i0, 0x40, %i0
7187 
7188 .ci_blkdone:
7189         membar  #Sync
7190 
7191         brz,pt  %i2, .copyin_exit
7192         nop
7193 
7194         ! Handle trailing bytes
7195         cmp     %i2, 0x8
7196         blu,pt  %ncc, .ci_residue
7197         nop

        ! Can we do some 8-byte ops?
        or      %i1, %i0, %o2
        andcc   %o2, 0x7, %g0
        bnz     %ncc, .ci_last4
        nop

        ! Do 8-byte ops as long as possible
.ci_last8:
        ldxa    [%i1]ASI_USER, %o2
        stx     %o2, [%i0]
        add     %i1, 0x8, %i1
        sub     %i2, 0x8, %i2
        cmp     %i2, 0x8
        bgu,pt  %ncc, .ci_last8
        add     %i0, 0x8, %i0

        brz,pt  %i2, .copyin_exit
        nop

        ba      .ci_residue
        nop

.ci_last4:
        ! Can we do 4-byte ops?
        andcc   %o2, 0x3, %g0
        bnz     %ncc, .ci_last2
        nop
1:
        lda     [%i1]ASI_USER, %o2
        st      %o2, [%i0]
        add     %i1, 0x4, %i1
        sub     %i2, 0x4, %i2
        cmp     %i2, 0x4
        bgu,pt  %ncc, 1b
        add     %i0, 0x4, %i0

        brz,pt  %i2, .copyin_exit
        nop

        ba      .ci_residue
        nop

.ci_last2:
        ! Can we do 2-byte ops?
        andcc   %o2, 0x1, %g0
        bnz     %ncc, .ci_residue
        nop

1:
        lduha   [%i1]ASI_USER, %o2
        stuh    %o2, [%i0]
        add     %i1, 0x2, %i1
        sub     %i2, 0x2, %i2
        cmp     %i2, 0x2
        bgu,pt  %ncc, 1b
        add     %i0, 0x2, %i0

        brz,pt  %i2, .copyin_exit
        nop

        ! Copy the residue one byte at a time
.ci_residue:
        lduba   [%i1]ASI_USER, %i4
        stb     %i4, [%i0]
        inc     %i1
        deccc   %i2
        bgu,pt  %xcc, .ci_residue
        inc     %i0

.copyin_exit:
        membar  #Sync
        stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
        ret
        restore %g0, 0, %o0
.copyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4
        brz     %o4, 2f
        nop
        ldn     [%o4 + CP_COPYIN], %g2
        jmp     %g2
        nop
2:
        retl
        mov     -1, %o0
#endif  /* NIAGARA_IMPL */
        SET_SIZE(copyin)

#endif  /* lint */

#ifdef  lint

/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else   /* lint */

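/*
 * xcopyin - like copyin, but on a fault it returns the error code the
 * lofault handler leaves in %g1 (rather than -1), so the caller can
 * tell how the copy failed.  If a t_copyops vector is installed, the
 * fault path hands off to its CP_XCOPYIN routine instead.
 */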
        ENTRY(xcopyin)
        sethi   %hi(.xcopyin_err), REAL_LOFAULT
        b       .do_copyin
        or      REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
        ldn     [THREAD_REG + T_COPYOPS], %o4
        brz     %o4, 2f
        nop
        ldn     [%o4 + CP_XCOPYIN], %g2
        jmp     %g2
        nop
2:
        retl
        mov     %g1, %o0
        SET_SIZE(xcopyin)

#endif  /* lint */

#ifdef  lint

/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else   /* lint */

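/*
 * xcopyin_little - copy count bytes from user space to kernel space,
 * reversing byte order along the way (single-byte loads through the
 * little-endian user ASI, walking the source backwards).  The net
 * effect is roughly this illustrative C sketch, not the actual code:
 *
 *	for (i = 0; i < count; i++)
 *		((char *)kaddr)[i] = ((const char *)uaddr)[count - 1 - i];
 *
 * Returns 0 on success; on a fault, .little_err returns the error
 * code left in %g1.
 */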
        ENTRY(xcopyin_little)
        sethi   %hi(.little_err), %o4
        ldn     [THREAD_REG + T_LOFAULT], %o5
        or      %o4, %lo(.little_err), %o4
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! install new vector

        subcc   %g0, %o2, %o3
        add     %o0, %o2, %o0
        bz,pn   %ncc, 2f                ! check for zero bytes
        sub     %o2, 1, %o4
        add     %o0, %o4, %o0           ! start w/last byte
        add     %o1, %o2, %o1
        lduba   [%o0+%o3]ASI_AIUSL, %o4

1:      stb     %o4, [%o1+%o3]
        inccc   %o3
        sub     %o0, 2, %o0             ! get next byte
        bcc,a,pt %ncc, 1b
        lduba   [%o0+%o3]ASI_AIUSL, %o4

2:      membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
        mov     %g0, %o0                ! return (0)

.little_err:
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
        mov     %g1, %o0
        SET_SIZE(xcopyin_little)

#endif  /* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */
#if defined(lint)

/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}

#else   /* lint */

        ENTRY(copyin_noerr)
        sethi   %hi(.copyio_noerr), REAL_LOFAULT
        b       .do_copyin
        or      REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
        jmp     SAVED_LOFAULT
        nop
        SET_SIZE(copyin_noerr)

#endif /* lint */

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

#if defined(lint)

/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}

#else   /* lint */

        ENTRY(copyout_noerr)
        sethi   %hi(.copyio_noerr), REAL_LOFAULT
        b       .do_copyout
        or      REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
        SET_SIZE(copyout_noerr)

#endif /* lint */

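/*
 * Tunables for the copy/zero paths.  use_hw_bcopy and use_hw_bzero
 * enable the block-initializing store paths; the hw_copy_limit_N
 * values are (presumably) the byte counts above which the hardware
 * copy path is preferred for N-byte-aligned operands, with 0 meaning
 * the hardware path is disabled for that alignment.
 */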
#if defined(lint)

int use_hw_bcopy = 1;
int use_hw_bzero = 1;
uint_t hw_copy_limit_1 = 0x100;
uint_t hw_copy_limit_2 = 0x200;
uint_t hw_copy_limit_4 = 0x400;
uint_t hw_copy_limit_8 = 0x400;

#else /* !lint */

        .align  4
        DGDEF(use_hw_bcopy)
        .word   1
        DGDEF(use_hw_bzero)
        .word   1
        DGDEF(hw_copy_limit_1)
        .word   0x100
        DGDEF(hw_copy_limit_2)
        .word   0x200
        DGDEF(hw_copy_limit_4)
        .word   0x400
        DGDEF(hw_copy_limit_8)
        .word   0x400

        .align  64
        .section ".text"
#endif /* !lint */

/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that
 * are at least 256 bytes long, using Niagara's block-initializing
 * quad stores.  If the criteria for using this routine are not met,
 * it calls bzero and returns 1; otherwise it returns 0 to indicate
 * success.  The caller is responsible for ensuring that use_hw_bzero
 * is true and that kpreempt_disable() has been called.
 */
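/*
 * An illustrative sketch of the checks below (not extra semantics):
 *
 *	if (((uintptr_t)addr & 0x3f) || len < 0x100 || (len & 0x3f)) {
 *		bzero(addr, len);
 *		return (1);		! punted
 *	}
 *	clear 256 bytes per pass, then 64 bytes per pass;
 *	return (0);
 */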
#ifdef lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
        return (0);
}
#else /* lint */
        ! %i0 - start address
        ! %i1 - length of region (multiple of 64)

        ENTRY(hwblkclr)
        save    %sp, -SA(MINFRAME), %sp

        ! Must be block-aligned
        andcc   %i0, 0x3f, %g0
        bnz,pn  %ncc, 1f
        nop

        ! ... and must be 256 bytes or more
        cmp     %i1, 0x100
        blu,pn  %ncc, 1f
        nop

        ! ... and length must be a multiple of 64
        andcc   %i1, 0x3f, %g0
        bz,pn   %ncc, .pz_doblock
        mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

1:      ! punt, call bzero but notify the caller that bzero was used
        mov     %i0, %o0
        call    bzero
        mov     %i1, %o1
        ret
        restore %g0, 1, %o0     ! return (1) - did not use block operations

        ! Already verified that there are at least 256 bytes to set
.pz_doblock:
        stxa    %g0, [%i0+0x0]%asi
        stxa    %g0, [%i0+0x40]%asi
        stxa    %g0, [%i0+0x80]%asi
        stxa    %g0, [%i0+0xc0]%asi

        stxa    %g0, [%i0+0x8]%asi
        stxa    %g0, [%i0+0x10]%asi
        stxa    %g0, [%i0+0x18]%asi
        stxa    %g0, [%i0+0x20]%asi
        stxa    %g0, [%i0+0x28]%asi
        stxa    %g0, [%i0+0x30]%asi
        stxa    %g0, [%i0+0x38]%asi

        stxa    %g0, [%i0+0x48]%asi
        stxa    %g0, [%i0+0x50]%asi
        stxa    %g0, [%i0+0x58]%asi
        stxa    %g0, [%i0+0x60]%asi
        stxa    %g0, [%i0+0x68]%asi
        stxa    %g0, [%i0+0x70]%asi
        stxa    %g0, [%i0+0x78]%asi

        stxa    %g0, [%i0+0x88]%asi
        stxa    %g0, [%i0+0x90]%asi
        stxa    %g0, [%i0+0x98]%asi
        stxa    %g0, [%i0+0xa0]%asi
        stxa    %g0, [%i0+0xa8]%asi
        stxa    %g0, [%i0+0xb0]%asi
        stxa    %g0, [%i0+0xb8]%asi

        stxa    %g0, [%i0+0xc8]%asi
        stxa    %g0, [%i0+0xd0]%asi
        stxa    %g0, [%i0+0xd8]%asi
        stxa    %g0, [%i0+0xe0]%asi
        stxa    %g0, [%i0+0xe8]%asi
        stxa    %g0, [%i0+0xf0]%asi
        stxa    %g0, [%i0+0xf8]%asi

        sub     %i1, 0x100, %i1
        cmp     %i1, 0x100
        bgu,pt  %ncc, .pz_doblock
        add     %i0, 0x100, %i0

2:
        ! Check if at least 64 bytes remain to set
        cmp     %i1, 0x40
        blu     %ncc, .pz_finish
        nop

3:
        stxa    %g0, [%i0+0x0]%asi
        stxa    %g0, [%i0+0x8]%asi
        stxa    %g0, [%i0+0x10]%asi
        stxa    %g0, [%i0+0x18]%asi
        stxa    %g0, [%i0+0x20]%asi
        stxa    %g0, [%i0+0x28]%asi
        stxa    %g0, [%i0+0x30]%asi
        stxa    %g0, [%i0+0x38]%asi

        subcc   %i1, 0x40, %i1
        bgu,pt  %ncc, 3b
        add     %i0, 0x40, %i0

.pz_finish:
        membar  #Sync
        ret
        restore %g0, 0, %o0             ! return (0) - used block operations
        SET_SIZE(hwblkclr)
#endif  /* lint */

#ifdef  lint
/* Copy 32 bytes of data from src to dst using physical addresses */
/*ARGSUSED*/
void
hw_pa_bcopy32(uint64_t src, uint64_t dst)
{}
#else   /*!lint */

        /*
         * Copy 32 bytes of data from src (%o0) to dst (%o1)
         * using physical addresses.
         */
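        /*
         * PSTATE_IE is cleared for the duration, presumably so the
         * physically-addressed loads and stores cannot be split by an
         * interrupt; the old %pstate is restored in the delay slot of
         * the retl.
         */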
        ENTRY_NP(hw_pa_bcopy32)
        rdpr    %pstate, %g1
        andn    %g1, PSTATE_IE, %g2
        wrpr    %g0, %g2, %pstate

        ldxa    [%o0]ASI_MEM, %o2
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o3
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o4
        add     %o0, 8, %o0
        ldxa    [%o0]ASI_MEM, %o5
        stxa    %o2, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o3, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o4, [%o1]ASI_MEM
        add     %o1, 8, %o1
        stxa    %o5, [%o1]ASI_MEM

        membar  #Sync
        retl
        wrpr    %g0, %g1, %pstate
        SET_SIZE(hw_pa_bcopy32)
#endif /* lint */

/*
 * Zero a block of storage.
 *
 * uzero is used by the kernel to zero a block in user address space.
 */

/*
 * Control flow of the bzero/kzero/uzero routine.
 *
 *	For counts of fewer than 7 bytes, zero one byte at a time.
 *
 *	For counts of fewer than 15 bytes, align the address on a 4-byte
 *	boundary, then store as many 4-byte chunks as possible, followed
 *	by the trailing bytes.
 *
 *	For counts of 15 bytes or more, align the address on an 8-byte
 *	boundary.
 *	if (count >= 128) {
 *		store as many 8-byte chunks as needed to block-align
 *		the address
 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
 *	}
 *	Store as many 8-byte chunks as possible, followed by the
 *	trailing bytes.
 */
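
/*
 * The t_lofault protocol used below, roughly (illustrative sketch):
 *
 *	%o5 = curthread->t_lofault;		! save previous handler
 *	if (kzero)
 *		%o5 |= LOFAULT_SET;		! kzero always installs
 *	if (kzero || %o5 != 0)			! uzero/bzero: only if one
 *		curthread->t_lofault = .zeroerr; !  was already set
 *	... zero the buffer ...
 *	on exit, strip LOFAULT_SET from %o5 and restore t_lofault
 */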

#if defined(lint)

/* ARGSUSED */
int
kzero(void *addr, size_t count)
{ return (0); }

/* ARGSUSED */
void
uzero(void *addr, size_t count)
{}

#else   /* lint */

        ENTRY(uzero)
        !
        ! Set a new lo_fault handler only if we came in with one
        ! already specified.
        !
        wr      %g0, ASI_USER, %asi
        ldn     [THREAD_REG + T_LOFAULT], %o5
        tst     %o5
        bz,pt   %ncc, .do_zero
        sethi   %hi(.zeroerr), %o2
        or      %o2, %lo(.zeroerr), %o2
        membar  #Sync
        ba,pt   %ncc, .do_zero
        stn     %o2, [THREAD_REG + T_LOFAULT]

        ENTRY(kzero)
        !
        ! Always set a lo_fault handler
        !
        wr      %g0, ASI_P, %asi
        ldn     [THREAD_REG + T_LOFAULT], %o5
        sethi   %hi(.zeroerr), %o2
        or      %o5, LOFAULT_SET, %o5
        or      %o2, %lo(.zeroerr), %o2
        membar  #Sync
        ba,pt   %ncc, .do_zero
        stn     %o2, [THREAD_REG + T_LOFAULT]

/*
 * We got here because of a fault during kzero, or because uzero or
 * bzero was called with t_lofault non-zero.  Otherwise we've already
 * run screaming from the room.  The errno value is in %g1.  Note that
 * we're here iff we did set t_lofault.
 */
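/*
 * Roughly (illustrative sketch):
 *
 *	if (%o5 == 0)
 *		return (%g1);		! nothing to restore
 *	curthread->t_lofault = %o5 & ~LOFAULT_SET;
 *	if ((%o5 & ~LOFAULT_SET) == 0)
 *		return (%g1);		! only LOFAULT_SET was set
 *	goto old handler;		! let it dispose of the error
 */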
.zeroerr:
        !
        ! Undo asi register setting. Just set it to be the
        ! kernel default without checking.
        !
        wr      %g0, ASI_P, %asi

        !
        ! We did set t_lofault. It may well have been zero coming in.
        !
1:
        tst     %o5
        membar  #Sync
        bne,pn  %ncc, 3f
        andncc  %o5, LOFAULT_SET, %o5
2:
        !
        ! Old handler was zero. Just return the error.
        !
        retl                            ! return
        mov     %g1, %o0                ! error code from %g1
3:
        !
        ! We're here because %o5 was non-zero. It was non-zero
        ! because either LOFAULT_SET was present, a previous fault
        ! handler was present, or both. In all cases we need to reset
        ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET,
        ! before we either simply return the error or invoke the
        ! previously specified handler.
        !
        be      %ncc, 2b
        stn     %o5, [THREAD_REG + T_LOFAULT]
        jmp     %o5                     ! goto real handler
        nop
        SET_SIZE(kzero)
        SET_SIZE(uzero)

#endif  /* lint */

/*
 * Zero a block of storage.
 */

#if defined(lint)

/* ARGSUSED */
void
bzero(void *addr, size_t count)
{}

#else   /* lint */

        ENTRY(bzero)
        wr      %g0, ASI_P, %asi

        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save old vector
        tst     %o5
        bz,pt   %ncc, .do_zero
        sethi   %hi(.zeroerr), %o2
        or      %o2, %lo(.zeroerr), %o2
        membar  #Sync                           ! sync error barrier
        stn     %o2, [THREAD_REG + T_LOFAULT]   ! install new vector

.do_zero:
        cmp     %o1, 7
        blu,pn  %ncc, .byteclr
        nop

        cmp     %o1, 15
        blu,pn  %ncc, .wdalign
        nop

        andcc   %o0, 7, %o3             ! is addr aligned on an 8-byte bound?
        bz,pt   %ncc, .blkalign         ! already double aligned
        sub     %o3, 8, %o3             ! -(bytes till double aligned)
        add     %o1, %o3, %o1           ! update o1 with new count

1:
        stba    %g0, [%o0]%asi
        inccc   %o3
        bl,pt   %ncc, 1b
        inc     %o0
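
        ! The loop above uses the negative-count idiom: %o3 runs from
        ! -(bytes till aligned) up to zero, storing one byte per pass,
        ! and %o1 was already reduced by the same amount, so only the
        ! aligned remainder is left for the loops below.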

        ! Now address is double aligned
.blkalign:
        cmp     %o1, 0x80               ! check if at least 128 bytes to set
        blu,pn  %ncc, .bzero_small
        mov     %o1, %o3

        sethi   %hi(use_hw_bzero), %o2
        ld      [%o2 + %lo(use_hw_bzero)], %o2
        tst     %o2
        bz      %ncc, .bzero_small
        mov     %o1, %o3

        rd      %asi, %o3
        wr      %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
        cmp     %o3, ASI_P
        bne,a   %ncc, .algnblk
        wr      %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

.algnblk:
        andcc   %o0, 0x3f, %o3          ! is addr block aligned?
        bz,pt   %ncc, .bzero_blk
        sub     %o3, 0x40, %o3          ! -(bytes till block aligned)
        add     %o1, %o3, %o1           ! o1 is the remainder

        ! Clear -(%o3) bytes till block aligned
1:
        stxa    %g0, [%o0]%asi
        addcc   %o3, 8, %o3
        bl,pt   %ncc, 1b
        add     %o0, 8, %o0

.bzero_blk:
        and     %o1, 0x3f, %o3          ! calc bytes left after blk clear
        andn    %o1, 0x3f, %o4          ! calc size of blocks in bytes

        cmp     %o4, 0x100              ! 256 bytes or more
        blu,pn  %ncc, 3f
        nop

2:
        stxa    %g0, [%o0+0x0]%asi
        stxa    %g0, [%o0+0x40]%asi
        stxa    %g0, [%o0+0x80]%asi
        stxa    %g0, [%o0+0xc0]%asi

        stxa    %g0, [%o0+0x8]%asi
        stxa    %g0, [%o0+0x10]%asi
        stxa    %g0, [%o0+0x18]%asi
        stxa    %g0, [%o0+0x20]%asi
        stxa    %g0, [%o0+0x28]%asi
        stxa    %g0, [%o0+0x30]%asi
        stxa    %g0, [%o0+0x38]%asi

        stxa    %g0, [%o0+0x48]%asi
        stxa    %g0, [%o0+0x50]%asi
        stxa    %g0, [%o0+0x58]%asi
        stxa    %g0, [%o0+0x60]%asi
        stxa    %g0, [%o0+0x68]%asi
        stxa    %g0, [%o0+0x70]%asi
        stxa    %g0, [%o0+0x78]%asi

        stxa    %g0, [%o0+0x88]%asi
        stxa    %g0, [%o0+0x90]%asi
        stxa    %g0, [%o0+0x98]%asi
        stxa    %g0, [%o0+0xa0]%asi
        stxa    %g0, [%o0+0xa8]%asi
        stxa    %g0, [%o0+0xb0]%asi
        stxa    %g0, [%o0+0xb8]%asi

        stxa    %g0, [%o0+0xc8]%asi
        stxa    %g0, [%o0+0xd0]%asi
        stxa    %g0, [%o0+0xd8]%asi
        stxa    %g0, [%o0+0xe0]%asi
        stxa    %g0, [%o0+0xe8]%asi
        stxa    %g0, [%o0+0xf0]%asi
        stxa    %g0, [%o0+0xf8]%asi

        sub     %o4, 0x100, %o4
        cmp     %o4, 0x100
        bgu,pt  %ncc, 2b
        add     %o0, 0x100, %o0

3:
        ! ... check if at least 64 bytes to set
        cmp     %o4, 0x40
        blu     %ncc, .bzero_blk_done
        nop

4:
        stxa    %g0, [%o0+0x0]%asi
        stxa    %g0, [%o0+0x8]%asi
        stxa    %g0, [%o0+0x10]%asi
        stxa    %g0, [%o0+0x18]%asi
        stxa    %g0, [%o0+0x20]%asi
        stxa    %g0, [%o0+0x28]%asi
        stxa    %g0, [%o0+0x30]%asi
        stxa    %g0, [%o0+0x38]%asi

        subcc   %o4, 0x40, %o4
        bgu,pt  %ncc, 3b
        add     %o0, 0x40, %o0

.bzero_blk_done:
        membar  #Sync
        !
        ! Undo asi register setting.
        !
        rd      %asi, %o4
        wr      %g0, ASI_P, %asi
        cmp     %o4, ASI_BLK_INIT_ST_QUAD_LDD_P
        bne,a   %ncc, .bzero_small
        wr      %g0, ASI_USER, %asi

.bzero_small:
        ! Set the remaining doubles
        subcc   %o3, 8, %o3             ! Can we store any doubles?
        blu,pn  %ncc, .byteclr
        and     %o1, 7, %o1             ! calc bytes left after doubles

.dbclr:
        stxa    %g0, [%o0]%asi          ! Clear the doubles
        subcc   %o3, 8, %o3
        bgeu,pt %ncc, .dbclr
        add     %o0, 8, %o0

        ba      .byteclr
        nop

.wdalign:
        andcc   %o0, 3, %o3             ! is addr aligned on a word boundary?
        bz,pn   %ncc, .wdclr
        andn    %o1, 3, %o3             ! create word sized count in %o3

        dec     %o1                     ! decrement count
        stba    %g0, [%o0]%asi          ! clear a byte
        ba      .wdalign
        inc     %o0                     ! next byte

.wdclr:
        sta     %g0, [%o0]%asi          ! 4-byte clearing loop
        subcc   %o3, 4, %o3
        bnz,pt  %ncc, .wdclr
        inc     4, %o0

        and     %o1, 3, %o1             ! leftover count, if any

.byteclr:
        ! Set the leftover bytes
        brz     %o1, .bzero_exit
        nop

7:
        deccc   %o1                     ! byte clearing loop
        stba    %g0, [%o0]%asi
        bgu,pt  %ncc, 7b
        inc     %o0

.bzero_exit:
        !
        ! We're just concerned with whether t_lofault was set
        ! when we came in. We end up here from either kzero()
        ! or bzero(). kzero() *always* sets a lofault handler.
        ! It ORs LOFAULT_SET into %o5 to indicate it has done
        ! this even if the value of %o5 is otherwise zero.
        ! bzero() sets a lofault handler *only* if one was
        ! previously set. Accordingly we need to examine
        ! %o5 and, if it is non-zero, be sure to clear LOFAULT_SET
        ! before restoring the error handler.
        !
        tst     %o5
        bz      %ncc, 1f
        andn    %o5, LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1:
        retl
        clr     %o0                     ! return (0)

        SET_SIZE(bzero)
#endif  /* lint */