/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */


#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>

#include "assym.h"


/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/kcopy routine.
 *
 *      ! WARNING: <register usage convention>
 *      ! In kcopy(), %o5 holds the previous error handler and the flag
 *      ! LOFAULT_SET (low bits). %o5 is null in bcopy().
 *      ! %o5 is not available for any other use.
 *
 * On entry:
 *      ! Determine whether to use the FP register version or the
 *      ! leaf routine version depending on the size of the copy.
 *      ! Set up error handling accordingly.
 *      ! The transition point depends on FP_COPY.
 *      ! For both versions, %o5 is reserved.
 *
 * kcopy():
 *      if(length > FP_COPY)
 *              go to regular_kcopy
 *
 *      ! Setup_leaf_rtn_error_handler
 *      %o5 = curthread->t_lofault;          ! save existing handler in %o5
 *      %o5 |= LOFAULT_SET;                  ! ORed with LOFAULT_SET flag
 *      curthread->t_lofault = .sm_copyerr;
 *      goto small_bcopy();
 *
 * regular_kcopy:
 *      save_registers()
 *      %o5 = curthread->t_lofault;          ! save existing handler in %o5
 *      %o5 |= LOFAULT_SET;                  ! ORed with LOFAULT_SET flag
 *      curthread->t_lofault = .copyerr;
 *      goto do_copy();
 *
 * bcopy():
 *      if(length > FP_COPY)
 *              go to regular_bcopy
 *
 *      ! Setup_leaf_rtn_error_handler
 *      %o5 = curthread->t_lofault;          ! save existing handler in %o5
 *      curthread->t_lofault = .sm_copyerr;
 *      goto small_bcopy();
 *
 * regular_bcopy:
 *      %o5 = curthread->t_lofault;          ! save existing handler in %o5
 *      curthread->t_lofault = .copyerr;
 *      goto do_copy();
 *
 * small_bcopy:
 *      ! handle copies smaller than FP_COPY
 *      restore t_lofault handler
 *      exit
 *
 * do_copy:
 *      ! handle copies larger than FP_COPY
 *      save fp_regs
 *      blockcopy;
 *      restore fp_regs
 *      restore t_lofault handler if came from kcopy();
 *
 *
 * In leaf lofault handler:
 *      curthread->t_lofault = (%o5 & ~LOFAULT_SET);     ! restore old t_lofault
 *      return (errno)
 *
 * In lofault handler:
 *      curthread->t_lofault = (%o5 & ~LOFAULT_SET);     ! restore old t_lofault
 *      restore fp_regs
 *      return (errno)
 *
 *
 *
 * For all of bcopy/copyin/copyout the copy logic is specialized according
 * to how the src and dst are aligned and how much data needs to be moved.
 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL)).
 *
 * N2/RF Flow :
 *
 * if (count < FP_COPY) {  (584 bytes)
 *   set small fault handler (no register window save/restore)
 *   if count < SHORTCOPY  (7 bytes)
 *      copy bytes; go to short_exit
 *   else
 *   determine dst alignment, move minimum bytes/halfwords to
 *   get dst aligned on long word boundary
 *     if( src is on long word boundary ) {
 * medlong:                                        src/dst aligned on 8 bytes
 *       copy with ldx/stx in 4-way unrolled loop;
 *       copy final 0-31 bytes; go to short_exit
 *     } else {                                 src/dst not aligned on 8 bytes
 *     if src is word aligned, ld/st words in 32-byte chunks
 *     if src is half word aligned, ld half, ld word, ld half; pack
 *              into long word, store long words in 32-byte chunks
 *     if src is byte aligned, ld byte,half,word parts;  pack into long
 *         word, store long words in 32-byte chunks
 *     move final 0-31 bytes according to src alignment;  go to short_exit
 * short_exit:
 *     restore trap handler if needed, retl
 * } else {                                        More than FP_COPY bytes
 *     set fault handler
 *     disable kernel preemption
 *     save registers, save FP registers if in use
 *     move bytes to align destination register on long word boundary
 *     if(src is on long word boundary) {          src/dst aligned on 8 bytes
 *       align dst on 64 byte boundary;  use 8-way test for each of 8 possible
 *       src alignments relative to a 64 byte boundary to select the
 *       16-way unrolled loop (128 bytes) to use for
 *       block load, fmovd, block-init-store, block-store, fmovd operations
 *       then go to remain_stuff.
 * remain_stuff: move remaining bytes. go to long_exit
 *     } else {
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments to nearest long word relative to 64 byte boundary to
 *       select the 8-way unrolled loop (64 bytes) to use for
 *       block load, falign, fmovd, block-store loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 *       goto unalign_done.
 * unalign_done:
 *       move remaining bytes for unaligned cases. go to long_exit
 * long_exit:
 *       restore %gsr, FP regs (either from stack or set to zero),
 *       restore trap handler, check for kernel preemption request,
 *       handle if needed, ret.
 * }
 *
 * Other platforms include hw_bcopy_limit_[1248] to control the exact
 * point where the FP register code is used. On those platforms, the
 * FP register code did not leave data in L2 cache, potentially affecting
 * performance more than the gain/loss from the algorithm difference.
 * For N2/RF, block store places data in the L2 cache, so use or non-use
 * of the FP registers has no effect on L2 cache behavior.
 * The cost of testing hw_bcopy_limit_* according to different
 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
 * were not used. That cost was judged too high relative to the benefits,
 * so the hw_bcopy_limit option is omitted from this code.
 */
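
/*
 * A minimal C sketch of the t_lofault protocol above (names here are
 * illustrative only; the error paths are really entered from the trap
 * handler, which jumps to .sm_copyerr/.copyerr with errno in %g1):
 *
 *      uintptr_t old = curthread->t_lofault;
 *      uintptr_t o5;                           // the %o5 analogue
 *
 *      if (is_kcopy) {
 *              o5 = old;                       // unflagged: return errno
 *              curthread->t_lofault = (uintptr_t)handler;
 *      } else if (old != 0) {
 *              o5 = old | LOFAULT_SET;         // flagged: trampoline
 *              curthread->t_lofault = (uintptr_t)handler;
 *      } else {
 *              o5 = 0;                         // bcopy, nothing installed
 *      }
 *      ... copy; a pagefault diverts to handler ...
 *      // in the fault handler:
 *      curthread->t_lofault = o5 & ~(uintptr_t)LOFAULT_SET;
 *      if (o5 & LOFAULT_SET)
 *              jump_to(o5 & ~(uintptr_t)LOFAULT_SET);  // prior handler
 *      else
 *              return (g1);                    // errno from the trap
 */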

/*
 * For counts less than or equal to this number of bytes, we always
 * copy byte-for-byte.
 */
#define SMALL_LIMIT     7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that a t_lofault
 * handler was set.
 */
#define LOFAULT_SET 2

/*
 * This macro aligns data for the unaligned-source cases:
 * data1, data2, and data3 are merged into data1 and data2;
 * data3 is preserved for the next merge.
 */
#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)    \
        sllx    data1, lshift, data1                            ;\
        srlx    data2, rshift, tmp                              ;\
        or      data1, tmp, data1                               ;\
        sllx    data2, lshift, data2                            ;\
        srlx    data3, rshift, tmp                              ;\
        or      data2, tmp, data2
/*
 * This macro merges data1 and data2 to form a single aligned
 * double word.
 */
#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)        \
        sllx    data1, lshift, data1                            ;\
        srlx    data2, rshift, tmp                              ;\
        or      data1, tmp, data1
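
/*
 * In C terms (a sketch; the callers arrange lshift + rshift == 64),
 * ALIGN_DATA computes:
 *
 *      data1 = (data1 << lshift) | (data2 >> rshift);
 *      data2 = (data2 << lshift) | (data3 >> rshift);
 *      // data3 keeps its original value for the next merge
 *
 * and ALIGN_DATA_EW is the single-double-word tail case:
 *
 *      data1 = (data1 << lshift) | (data2 >> rshift);
 */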

#if !defined(NIAGARA_IMPL)
/*
 * Flags set in the lower bits of the t_lofault address:
 * FPUSED_FLAG: The FP registers were in use and must be restored
 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
 * COPY_FLAGS: Both of the above
 *
 * Other flags:
 * KPREEMPT_FLAG: kpreempt needs to be called
 */
#define FPUSED_FLAG     1
#define LOFAULT_SET     2
#define COPY_FLAGS      (FPUSED_FLAG | LOFAULT_SET)
#define KPREEMPT_FLAG   4
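
/*
 * The two COPY_FLAGS bits ride in the low bits of the saved t_lofault
 * value, which is safe because the handler code is at least 4-byte
 * aligned. A C sketch of the tagging (illustrative names):
 *
 *      uintptr_t tagged = old_handler | FPUSED_FLAG | LOFAULT_SET;
 *      uintptr_t addr = tagged & ~(uintptr_t)COPY_FLAGS; // recover address
 *      int fpused = (tagged & FPUSED_FLAG) != 0;
 *
 * KPREEMPT_FLAG is never stored in the pointer; the error path below
 * carries it in a scratch register (%l1).
 */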

#define ALIGN_OFF_1_7                   \
        faligndata %d0, %d2, %d48       ;\
        faligndata %d2, %d4, %d50       ;\
        faligndata %d4, %d6, %d52       ;\
        faligndata %d6, %d8, %d54       ;\
        faligndata %d8, %d10, %d56      ;\
        faligndata %d10, %d12, %d58     ;\
        faligndata %d12, %d14, %d60     ;\
        faligndata %d14, %d16, %d62

#define ALIGN_OFF_8_15                  \
        faligndata %d2, %d4, %d48       ;\
        faligndata %d4, %d6, %d50       ;\
        faligndata %d6, %d8, %d52       ;\
        faligndata %d8, %d10, %d54      ;\
        faligndata %d10, %d12, %d56     ;\
        faligndata %d12, %d14, %d58     ;\
        faligndata %d14, %d16, %d60     ;\
        faligndata %d16, %d18, %d62

#define ALIGN_OFF_16_23                 \
        faligndata %d4, %d6, %d48       ;\
        faligndata %d6, %d8, %d50       ;\
        faligndata %d8, %d10, %d52      ;\
        faligndata %d10, %d12, %d54     ;\
        faligndata %d12, %d14, %d56     ;\
        faligndata %d14, %d16, %d58     ;\
        faligndata %d16, %d18, %d60     ;\
        faligndata %d18, %d20, %d62

#define ALIGN_OFF_24_31                 \
        faligndata %d6, %d8, %d48       ;\
        faligndata %d8, %d10, %d50      ;\
        faligndata %d10, %d12, %d52     ;\
        faligndata %d12, %d14, %d54     ;\
        faligndata %d14, %d16, %d56     ;\
        faligndata %d16, %d18, %d58     ;\
        faligndata %d18, %d20, %d60     ;\
        faligndata %d20, %d22, %d62

#define ALIGN_OFF_32_39                 \
        faligndata %d8, %d10, %d48      ;\
        faligndata %d10, %d12, %d50     ;\
        faligndata %d12, %d14, %d52     ;\
        faligndata %d14, %d16, %d54     ;\
        faligndata %d16, %d18, %d56     ;\
        faligndata %d18, %d20, %d58     ;\
        faligndata %d20, %d22, %d60     ;\
        faligndata %d22, %d24, %d62

#define ALIGN_OFF_40_47                 \
        faligndata %d10, %d12, %d48     ;\
        faligndata %d12, %d14, %d50     ;\
        faligndata %d14, %d16, %d52     ;\
        faligndata %d16, %d18, %d54     ;\
        faligndata %d18, %d20, %d56     ;\
        faligndata %d20, %d22, %d58     ;\
        faligndata %d22, %d24, %d60     ;\
        faligndata %d24, %d26, %d62

#define ALIGN_OFF_48_55                 \
        faligndata %d12, %d14, %d48     ;\
        faligndata %d14, %d16, %d50     ;\
        faligndata %d16, %d18, %d52     ;\
        faligndata %d18, %d20, %d54     ;\
        faligndata %d20, %d22, %d56     ;\
        faligndata %d22, %d24, %d58     ;\
        faligndata %d24, %d26, %d60     ;\
        faligndata %d26, %d28, %d62

#define ALIGN_OFF_56_63                 \
        faligndata %d14, %d16, %d48     ;\
        faligndata %d16, %d18, %d50     ;\
        faligndata %d18, %d20, %d52     ;\
        faligndata %d20, %d22, %d54     ;\
        faligndata %d22, %d24, %d56     ;\
        faligndata %d24, %d26, %d58     ;\
        faligndata %d26, %d28, %d60     ;\
        faligndata %d28, %d30, %d62
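
/*
 * For reference, faligndata %dA, %dB, %dC concatenates its two 8-byte
 * sources and extracts the 8 bytes starting at the byte offset held in
 * GSR.align (set up by alignaddr). A C sketch of that extraction, for
 * SPARC's big-endian register view (off is GSR.align, 0-7):
 *
 *      uint64_t faligndata(uint64_t hi, uint64_t lo, unsigned off)
 *      {
 *              if (off == 0)
 *                      return (hi);
 *              return ((hi << (8 * off)) | (lo >> (64 - 8 * off)));
 *      }
 *
 * Each ALIGN_OFF_* group above simply selects which pair of source
 * registers feeds each of the eight output registers, one group per
 * 8-byte sub-range of the source offset within a 64-byte block.
 */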

/*
 * FP_COPY indicates the minimum number of bytes needed
 * to justify using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing FP_COPY below 256.
 */
#define FP_COPY                 584
#define SHORTCOPY               7
#define ASI_STBI_P              ASI_BLK_INIT_ST_QUAD_LDD_P
#define ASI_STBI_AIUS           ASI_BLK_INIT_QUAD_LDD_AIUS
#define CACHE_LINE              64
#define VIS_BLOCKSIZE           64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to ensure a
 * block-aligned three-block buffer in which to save them, we must
 * reserve four blocks on the stack.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 3 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 4)
#define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 3) + 1)
#define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
#define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
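
/*
 * A C sketch of the save-area address computed by BST_FP_TOSTACK below
 * (fp stands in for the %fp register value):
 *
 *      uintptr_t tmp = (uintptr_t)fp + STACK_BIAS - SAVED_FPREGS_ADJUST;
 *      tmp &= -(uintptr_t)VIS_BLOCKSIZE;       // round down to a block
 *
 * Rounding down moves the start back by at most 63 bytes, so the three
 * saved blocks (192 bytes) always land inside the reserved four-block
 * (256-byte) region.
 */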

/*
 * In FP copies, if we do not have preserved data to restore over
 * the fp regs we used, we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 */
#define FZERO                           \
        fzero   %f0                     ;\
        fzero   %f2                     ;\
        faddd   %f0, %f2, %f4           ;\
        fmuld   %f0, %f2, %f6           ;\
        faddd   %f0, %f2, %f8           ;\
        fmuld   %f0, %f2, %f10          ;\
        faddd   %f0, %f2, %f12          ;\
        fmuld   %f0, %f2, %f14          ;\
        faddd   %f0, %f2, %f16          ;\
        fmuld   %f0, %f2, %f18          ;\
        faddd   %f0, %f2, %f20          ;\
        fmuld   %f0, %f2, %f22          ;\
        faddd   %f0, %f2, %f24          ;\
        fmuld   %f0, %f2, %f26          ;\
        faddd   %f0, %f2, %f28          ;\
        fmuld   %f0, %f2, %f30          ;\
        faddd   %f0, %f2, %f48          ;\
        fmuld   %f0, %f2, %f50          ;\
        faddd   %f0, %f2, %f52          ;\
        fmuld   %f0, %f2, %f54          ;\
        faddd   %f0, %f2, %f56          ;\
        fmuld   %f0, %f2, %f58          ;\
        faddd   %f0, %f2, %f60          ;\
        fmuld   %f0, %f2, %f62
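
/*
 * Only %f0 and %f2 are cleared directly; every other register is
 * produced arithmetically, since 0 + 0 and 0 * 0 are both zero.
 * A C analogue of the idea:
 *
 *      double f0 = 0.0, f2 = 0.0;
 *      double f4 = f0 + f2;            // 0.0
 *      double f6 = f0 * f2;            // 0.0
 *      ... and so on for the remaining registers ...
 *
 * The add/mul alternation presumably gives the FP unit a mix of
 * independent operations that can overlap in the pipeline.
 */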

/*
 * Macros to save and restore fp registers to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP.
 */
#define BST_FP_TOSTACK(tmp1)                                    \
        /* membar #Sync */                                      ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        stda    %f0, [tmp1]ASI_BLK_P                            ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        stda    %f16, [tmp1]ASI_BLK_P                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        stda    %f48, [tmp1]ASI_BLK_P                           ;\
        membar  #Sync

#define BLD_FP_FROMSTACK(tmp1)                                  \
        /* membar #Sync - provided at copy completion */        ;\
        add     %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1     ;\
        and     tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */    ;\
        ldda    [tmp1]ASI_BLK_P, %f0                            ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f16                           ;\
        add     tmp1, VIS_BLOCKSIZE, tmp1                       ;\
        ldda    [tmp1]ASI_BLK_P, %f48                           ;\
        membar  #Sync

#endif
/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns an errno value on pagefault error, 0 if all is OK.
 */
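
/*
 * Caller's view, in C (the usage sketch is illustrative):
 *
 *      int kcopy(const void *from, void *to, size_t count);
 *
 *      if (kcopy(src, dst, len) != 0)
 *              ... the copy faulted; an errno value was returned ...
 */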

        .seg    ".text"
        .align  4

        ENTRY(kcopy)
#if !defined(NIAGARA_IMPL)
        cmp     %o2, FP_COPY                    ! check for small copy/leaf case
        bgt,pt  %ncc, .kcopy_more               !
        nop
.kcopy_small:                                   ! setup error handler
        sethi   %hi(.sm_copyerr), %o4
        or      %o4, %lo(.sm_copyerr), %o4      ! .sm_copyerr is lofault value
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        ! Note that we carefully do *not* flag the setting of
        ! t_lofault.
        membar  #Sync                           ! sync error barrier
        b       .sm_do_copy                     ! common code
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! set t_lofault


.kcopy_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
        or      %l7, %lo(.copyerr), %l7
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        ! Note that we carefully do *not* flag the setting of
        ! t_lofault.
        membar  #Sync                           ! sync error barrier
        b       .do_copy                        ! common code
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

/*
 * We got here because of a fault during a small kcopy, or during a
 * small bcopy if a fault handler existed when bcopy was called.
 * No floating point registers are used by the small copies.
 * Small copies run as leaf routines.
 * Errno value is in %g1.
 */
.sm_copyerr:
        ! The kcopy will always set a t_lofault handler. If it fires,
        ! we're expected to just return the error code and not to
        ! invoke any existing error handler. As far as bcopy is concerned,
        ! we only set t_lofault if there was an existing lofault handler.
        ! In that case we're expected to invoke the previously existing
        ! handler after resetting the t_lofault value.
        btst    LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        andn    %o5, LOFAULT_SET, %o5           ! clear fault flag
        bnz,pn  %ncc, 3f
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        retl
        mov     %g1, %o0
3:
        ! We're here via bcopy. There must have been an error handler
        ! in place otherwise we would have died a nasty death already.
        jmp     %o5                             ! goto real handler
        mov     %g0, %o0
/*
 *  end of .sm_copyerr
 */

/*
 * We got here because of a fault during kcopy, or during bcopy if a
 * fault handler existed when bcopy was called.
 * The stack and fp registers need to be restored.
 * Errno value is in %g1.
 */
.copyerr:
        sethi   %hi(.copyerr2), %l1
        or      %l1, %lo(.copyerr2), %l1
        membar  #Sync                           ! sync error barrier
        stn     %l1, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        btst    FPUSED_FLAG, %o5
        bz,pt   %xcc, 1f
        and     %o5, LOFAULT_SET, %l1           ! copy flag to %l1

        membar  #Sync                           ! sync error barrier
        wr      %l5, 0, %gsr
        btst    FPRS_FEF, %g5
        bz,pt   %icc, 4f
        nop
        ! restore fpregs from stack
        BLD_FP_FROMSTACK(%o2)
        ba,pt   %ncc, 2f
        wr      %g5, 0, %fprs           ! restore fprs
4:
        FZERO
        wr      %g5, 0, %fprs           ! restore fprs
2:
        ldn     [THREAD_REG + T_LWP], %o2
        brnz,pt %o2, 1f
        nop

        ldsb    [THREAD_REG + T_PREEMPT], %l0
        deccc   %l0
        bnz,pn  %ncc, 1f
        stb     %l0, [THREAD_REG + T_PREEMPT]

        ! Check for a kernel preemption request
        ldn     [THREAD_REG + T_CPU], %l0
        ldub    [%l0 + CPU_KPRUNRUN], %l0
        brnz,a,pt       %l0, 1f                 ! Need to call kpreempt?
        or      %l1, KPREEMPT_FLAG, %l1         ! If so, set the flag

        ! The kcopy will always set a t_lofault handler. If it fires,
        ! we're expected to just return the error code and not to
        ! invoke any existing error handler. As far as bcopy is concerned,
        ! we only set t_lofault if there was an existing lofault handler.
        ! In that case we're expected to invoke the previously existing
        ! handler after resetting the t_lofault value.
1:
        andn    %o5, COPY_FLAGS, %o5    ! remove flags from lofault address
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault

        ! call kpreempt if necessary
        btst    KPREEMPT_FLAG, %l1
        bz,pt   %icc, 2f
        nop
        call    kpreempt
        rdpr    %pil, %o0       ! pass %pil
2:
        btst    LOFAULT_SET, %l1
        bnz,pn  %ncc, 3f
        nop
        ret
        restore %g1, 0, %o0
3:
        ! We're here via bcopy. There must have been an error handler
        ! in place otherwise we would have died a nasty death already.
        jmp     %o5                             ! goto real handler
        restore %g0, 0, %o0                     ! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
        .asciz  "Unable to restore fp state after copy operation"

        .align  4
.copyerr2:
        set     fp_panic_msg, %o0
        call    panic
        nop
/*
 *  end of .copyerr
 */

#else   /* NIAGARA_IMPL */
        save    %sp, -SA(MINFRAME), %sp
        set     .copyerr, %l7                   ! copyerr is lofault value
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        or      %o5, LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        b       .do_copy                        ! common code
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
        ! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
        ! into %o5 to indicate it has set the t_lofault handler. Need to clear
        ! the LOFAULT_SET flag before restoring the error handler.
        andn    %o5, LOFAULT_SET, %o5
        membar  #Sync                           ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
        ret
        restore %g1, 0, %o0
#endif  /* NIAGARA_IMPL */

        SET_SIZE(kcopy)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */
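
/*
 * Caller's view, in C (usage sketch is illustrative; the regions
 * must not overlap):
 *
 *      void bcopy(const void *from, void *to, size_t count);
 *
 *      bcopy(src, dst, len);   // note src-first order, unlike memcpy()
 */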

        ENTRY(bcopy)
#if !defined(NIAGARA_IMPL)
        cmp     %o2, FP_COPY                    ! check for small copy/leaf case
        bgt,pt  %ncc, .bcopy_more               !
        nop
.bcopy_small:                                   ! setup error handler
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        tst     %o5
        bz,pt   %icc, .sm_do_copy
        sethi   %hi(.sm_copyerr), %o4
        or      %o4, %lo(.sm_copyerr), %o4      ! .sm_copyerr is lofault value
        membar  #Sync                           ! sync error barrier
        stn     %o4, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        or      %o5, LOFAULT_SET, %o5           ! Error should trampoline
.sm_do_copy:
        mov     %o0, %g1                ! save %o0
        cmp     %o2, SHORTCOPY          ! make sure there is enough to align
        ble,pt  %ncc, .bc_smallest
        andcc   %o1, 0x7, %o3           ! is dest long aligned
        bnz,pn  %ncc, .bc_align
        andcc   %o1, 1, %o3             ! is dest byte aligned

! Destination is long word aligned
.bc_al_src:
        andcc   %o0, 7, %o3
        brnz,pt %o3, .bc_src_dst_unal8
        nop
/*
 * Special case for when src and dest are both long word aligned
 * and the total data to move is less than FP_COPY bytes.
 * Also handles the finish-up for large block moves, so the count
 * here may be less than 32 bytes.
 */
.bc_medlong:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medl31
        nop
.bc_medl32:
        ldx     [%o0], %o4              ! move 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count by 32
        stx     %o4, [%o1]
        ldx     [%o0+8], %o4
        stx     %o4, [%o1+8]
        ldx     [%o0+16], %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stx     %o4, [%o1+16]
        ldx     [%o0-8], %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .bc_medl32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]
.bc_medl31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medl7         ! skip if 7 or fewer bytes left
        nop
.bc_medl8:
        ldx     [%o0], %o4              ! move 8 bytes
        add     %o0, 8, %o0             ! increase src ptr by 8
        subcc   %o2, 8, %o2             ! decrease count by 8
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .bc_medl8
        stx     %o4, [%o1-8]
.bc_medl7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bnz,pt  %ncc, .bc_small4        ! do final bytes if not finished

.bc_smallx:                             ! finish up and exit
        tst     %o5
        bz,pt   %ncc, .bc_sm_done
        andn    %o5, COPY_FLAGS, %o5    ! remove flags from lofault address
        membar  #Sync                   ! sync error barrier
        stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
.bc_sm_done:
        retl
        mov     %g0, %o0

.bc_small4:
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%o0], %o4              ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bz,pt   %ncc, .bc_smallx
        stw     %o4, [%o1-4]

.bc_small3x:                            ! Exactly 1, 2, or 3 bytes remain
        subcc   %o2, 1, %o2             ! reduce count for cc test
        ldub    [%o0], %o4              ! load one byte
        bz,pt   %ncc, .bc_smallx
        stb     %o4, [%o1]              ! store one byte
        ldub    [%o0+1], %o4            ! load second byte
        subcc   %o2, 1, %o2
        bz,pt   %ncc, .bc_smallx
        stb     %o4, [%o1+1]            ! store second byte
        ldub    [%o0+2], %o4            ! load third byte
        ba      .bc_smallx
        stb     %o4, [%o1+2]            ! store third byte

.bc_smallest:                           ! 7 or fewer bytes remain
        tst     %o2
        bz,pt   %ncc, .bc_smallx
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x
        nop
        ldub    [%o0], %o4              ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stb     %o4, [%o1]              ! write byte
        ldub    [%o0+1], %o4            ! repeat for total of 4 bytes
        add     %o0, 4, %o0             ! advance src by 4
        stb     %o4, [%o1+1]
        ldub    [%o0-2], %o4
        add     %o1, 4, %o1             ! advance dst by 4
        stb     %o4, [%o1-2]
        ldub    [%o0-1], %o4
        bnz,pt  %ncc, .bc_small3x
        stb     %o4, [%o1-1]
        ba      .bc_smallx
        nop

/*
 * Align destination to long word boundary
 */
.bc_align:                              ! byte align test in prior branch delay
        bnz,pt  %ncc, .bc_al_d1
.bc_al_d1f:                             ! dest is now half word aligned
        andcc   %o1, 2, %o3
        bnz,pt  %ncc, .bc_al_d2
.bc_al_d2f:                             ! dest is now word aligned
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .bc_al_src
        nop
.bc_al_d4:                              ! dest is word aligned;  src is unknown
        ldub    [%o0], %o4              ! move a word (src align unknown)
        ldub    [%o0+1], %o3
        sll     %o4, 24, %o4            ! position
        sll     %o3, 16, %o3            ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%o0+2], %o4
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o3           ! merge
        ldub    [%o0+3], %o4
        or      %o4, %o3, %o4           ! merge
        stw     %o4,[%o1]               ! store four bytes
        add     %o0, 4, %o0             ! adjust src by 4
        add     %o1, 4, %o1             ! adjust dest by 4
        sub     %o2, 4, %o2             ! adjust count by 4
        andcc   %o0, 7, %o3             ! check for src long word alignment
        brz,pt  %o3, .bc_medlong
.bc_src_dst_unal8:
        ! dst is 8-byte aligned, src is not
        ! Size is less than FP_COPY
        ! Following code is to select for alignment
        andcc   %o0, 0x3, %o3           ! test word alignment
        bz,pt   %ncc, .bc_medword
        nop
        andcc   %o0, 0x1, %o3           ! test halfword alignment
        bnz,pt  %ncc, .bc_med_byte      ! go to byte move if not halfword
        andcc   %o0, 0x2, %o3           ! test which byte alignment
        ba      .bc_medhalf
        nop
.bc_al_d1:                              ! align dest to half word
        ldub    [%o0], %o4              ! move a byte
        add     %o0, 1, %o0
        stb     %o4, [%o1]
        add     %o1, 1, %o1
        andcc   %o1, 2, %o3
        bz,pt   %ncc, .bc_al_d2f
        sub     %o2, 1, %o2
.bc_al_d2:                              ! align dest to word
        ldub    [%o0], %o4              ! move a half-word (src align unknown)
        ldub    [%o0+1], %o3
        sll     %o4, 8, %o4             ! position
        or      %o4, %o3, %o4           ! merge
        sth     %o4, [%o1]
        add     %o0, 2, %o0
        add     %o1, 2, %o1
        andcc   %o1, 4, %o3             ! is dest longword aligned?
        bz,pt   %ncc, .bc_al_src
        sub     %o2, 2, %o2
        ba      .bc_al_d4
        nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache for medium
 * to short data moves.
 */
.bc_medword:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medw31
        nop
.bc_medw32:
        ld      [%o0], %o4              ! move a block of 32 bytes
        stw     %o4, [%o1]
        ld      [%o0+4], %o4
        stw     %o4, [%o1+4]
        ld      [%o0+8], %o4
        stw     %o4, [%o1+8]
        ld      [%o0+12], %o4
        stw     %o4, [%o1+12]
        ld      [%o0+16], %o4
        stw     %o4, [%o1+16]
        ld      [%o0+20], %o4
        subcc   %o2, 32, %o2            ! decrement length count
        stw     %o4, [%o1+20]
        ld      [%o0+24], %o4
        add     %o0, 32, %o0            ! increase src ptr by 32
        stw     %o4, [%o1+24]
        ld      [%o0-4], %o4
        add     %o1, 32, %o1            ! increase dst ptr by 32
        bgu,pt  %ncc, .bc_medw32        ! repeat if at least 32 bytes left
        stw     %o4, [%o1-4]
.bc_medw31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medw7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medw15:
        ld      [%o0], %o4              ! move a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        stw     %o4, [%o1]
        add     %o0, 8, %o0             ! increase src ptr by 8
        ld      [%o0-4], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        bgu,pt  %ncc, .bc_medw15
        stw     %o4, [%o1-4]
.bc_medw7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ld      [%o0], %o4              ! move 4 bytes
        add     %o0, 4, %o0             ! increase src ptr by 4
        add     %o1, 4, %o1             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

.bc_medhalf:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medh31
        nop
.bc_medh32:                             ! load and store block of 32 bytes
        subcc   %o2, 32, %o2            ! decrement length count

        lduh    [%o0], %o4              ! move 32 bytes
        lduw    [%o0+2], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+6], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        lduh    [%o0+8], %o4
        lduw    [%o0+10], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+14], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        lduh    [%o0+16], %o4
        lduw    [%o0+18], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0+22], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        lduh    [%o0-8], %o4
        lduw    [%o0-6], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        lduh    [%o0-2], %o4
        or      %o3, %o4, %o4
        bgu,pt  %ncc, .bc_medh32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medh7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medh15:
        lduh    [%o0], %o4              ! move 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        lduw    [%o0+2], %o3
        sllx    %o4, 48, %o4
        sllx    %o3, 16, %o3
        or      %o4, %o3, %o3
        add     %o1, 8, %o1             ! increase dst ptr by 8
        lduh    [%o0+6], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medh15
        stx     %o4, [%o1-8]
.bc_medh7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        lduh    [%o0], %o4
        sll     %o4, 16, %o4
        lduh    [%o0+2], %o3
        or      %o3, %o4, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

        .align 16
.bc_med_byte:
        bnz,pt  %ncc, .bc_medbh32a      ! go to correct byte move
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
        ble,pt  %ncc, .bc_medb31
        nop
.bc_medb32:                             ! Alignment 1 or 5
        subcc   %o2, 32, %o2            ! decrement length count

        ldub    [%o0], %o4              ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduh    [%o0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+3], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        ldub    [%o0+8], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0+9], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+11], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+15], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        ldub    [%o0+16], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0+17], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+19], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+23], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        ldub    [%o0-8], %o4
        sllx    %o4, 56, %o3
        lduh    [%o0-7], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0-5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0-1], %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medb32        ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medb31:                             ! 31 or fewer bytes remaining
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medb15:

        ldub    [%o0], %o4              ! load and store a block of 8 bytes
        subcc   %o2, 8, %o2             ! decrement length count
        sllx    %o4, 56, %o3
        lduh    [%o0+1], %o4
        sllx    %o4, 40, %o4
        or      %o4, %o3, %o3
        lduw    [%o0+3], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medb15
        stx     %o4, [%o1-8]
.bc_medb7:
        addcc   %o2, 7, %o2             ! finish adjustment of remaining count
        bz,pt   %ncc, .bc_smallx        ! exit if finished
        cmp     %o2, 4
        blt,pt  %ncc, .bc_small3x       ! skip if less than 4 bytes left
        nop                             !
        ldub    [%o0], %o4              ! move 4 bytes
        sll     %o4, 24, %o3
        lduh    [%o0+1], %o4
        sll     %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+3], %o4
        or      %o4, %o3, %o4
        subcc   %o2, 4, %o2
        add     %o0, 4, %o0
        add     %o1, 4, %o1
        bnz     .bc_small3x
        stw     %o4, [%o1-4]
        ba      .bc_smallx
        nop

        .align 16
.bc_medbh32a:                           ! Alignment 3 or 7
        ble,pt  %ncc, .bc_medbh31
        nop
.bc_medbh32:                            ! Alignment 3 or 7
        subcc   %o2, 32, %o2            ! decrement length count

        ldub    [%o0], %o4              ! load and store a block of 32 bytes
        sllx    %o4, 56, %o3
        lduw    [%o0+1], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]

        ldub    [%o0+8], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0+9], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+13], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+15], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+8]

        ldub    [%o0+16], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0+17], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+21], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+23], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1+16]

        add     %o0, 32, %o0            ! increase src ptr by 32
        add     %o1, 32, %o1            ! increase dst ptr by 32

        ldub    [%o0-8], %o4
        sllx    %o4, 56, %o3
        lduw    [%o0-7], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0-3], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0-1], %o4
        or      %o4, %o3, %o4
        bgu,pt  %ncc, .bc_medbh32       ! repeat if at least 32 bytes left
        stx     %o4, [%o1-8]

.bc_medbh31:
        addcc   %o2, 24, %o2            ! adjust count to be off by 7
        ble,pt  %ncc, .bc_medb7         ! skip if 7 or fewer bytes left
        nop                             !
.bc_medbh15:
        ldub    [%o0], %o4              ! load and store a block of 8 bytes
        sllx    %o4, 56, %o3
        lduw    [%o0+1], %o4
        sllx    %o4, 24, %o4
        or      %o4, %o3, %o3
        lduh    [%o0+5], %o4
        sllx    %o4, 8, %o4
        or      %o4, %o3, %o3
        ldub    [%o0+7], %o4
        or      %o4, %o3, %o4
        stx     %o4, [%o1]
        subcc   %o2, 8, %o2             ! decrement length count
        add     %o1, 8, %o1             ! increase dst ptr by 8
        add     %o0, 8, %o0             ! increase src ptr by 8
        bgu,pt  %ncc, .bc_medbh15
        stx     %o4, [%o1-8]
        ba      .bc_medb7
        nop

        SET_SIZE(bcopy)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 */
        ENTRY(bcopy_more)
.bcopy_more:
        save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
        ldn     [THREAD_REG + T_LOFAULT], %o5   ! save existing handler
        brz,pt  %o5, .do_copy
        nop
        sethi   %hi(.copyerr), %l7              ! copyerr is lofault value
        or      %l7, %lo(.copyerr), %l7
        membar  #Sync                           ! sync error barrier
        stn     %l7, [THREAD_REG + T_LOFAULT]   ! set t_lofault
        ! We've already captured whether t_lofault was zero on entry.
        ! We need to mark ourselves as being from bcopy since both
        ! kcopy and bcopy use the same code path. If LOFAULT_SET is
        ! set and the saved lofault was zero, we won't reset lofault on
        ! returning.
        or      %o5, LOFAULT_SET, %o5
.do_copy:
        ldn     [THREAD_REG + T_LWP], %o3
        brnz,pt %o3, 1f
        nop
/*
 * kpreempt_disable();
 */
        ldsb    [THREAD_REG + T_PREEMPT], %o3
        inc     %o3
        stb     %o3, [THREAD_REG + T_PREEMPT]
1:
/*
 * The following code is for large copies. We know at least FP_COPY
 * bytes are available. FP regs are used, so we save the registers
 * and fp regs before starting.
 */
        rd      %fprs, %g5              ! check for unused fp
        or      %o5, FPUSED_FLAG, %o5
        ! if fprs.fef == 0, set it.
        ! Setting it when already set costs more than checking
        andcc   %g5, FPRS_FEF, %g5      ! test FEF, fprs.du = fprs.dl = 0
        bz,pt   %ncc, .bc_fp_unused
        prefetch [%i0 + (1 * CACHE_LINE)], #one_read
        BST_FP_TOSTACK(%o3)
        ba      .bc_fp_ready
.bc_fp_unused:
        andcc   %i1, 1, %o3             ! is dest byte aligned
        wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
.bc_fp_ready:
        rd      %gsr, %l5               ! save %gsr value
        bnz,pt  %ncc, .bc_big_d1
.bc_big_d1f:                            ! dest is now half word aligned
        andcc   %i1, 2, %o3
        bnz,pt  %ncc, .bc_big_d2
.bc_big_d2f:                            ! dest is now word aligned
        andcc   %i1, 4, %o3
        bnz,pt  %ncc, .bc_big_d4
.bc_big_d4f:                            ! dest is now long word aligned
        andcc   %i0, 7, %o3             ! is src long word aligned
        brnz,pt %o3, .bc_big_unal8
        prefetch [%i0 + (2 * CACHE_LINE)], #one_read

        ! Src and dst are long word aligned
        ! align dst to 64 byte boundary
        andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .bc_al_to_64
        nop
        sub     %o3, 64, %o3            ! %o3 has negative bytes to move
        add     %i2, %o3, %i2           ! adjust remaining count
        andcc   %o3, 8, %o4             ! odd long words to move?
        brz,pt  %o4, .bc_al_to_16
        nop
        add     %o3, 8, %o3
        ldx     [%i0], %o4
        add     %i0, 8, %i0             ! increment src ptr
        add     %i1, 8, %i1             ! increment dst ptr
        stx     %o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.bc_al_to_16:
        andcc   %o3, 0x30, %o4          ! pair of long words to move?
        brz,pt  %o4, .bc_al_to_64
        nop
.bc_al_mv_16:
        add     %o3, 16, %o3
        ldx     [%i0], %o4
        stx     %o4, [%i1]
        ldx     [%i0+8], %o4
        add     %i0, 16, %i0            ! increment src ptr
        stx     %o4, [%i1+8]
        andcc   %o3, 48, %o4
        brnz,pt %o4, .bc_al_mv_16
        add     %i1, 16, %i1            ! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.bc_al_to_64:
        ! Determine source alignment
        ! to correct 8 byte offset
        andcc   %i0, 32, %o3
        brnz,pn %o3, .bc_aln_1
        andcc   %i0, 16, %o3
        brnz,pn %o3, .bc_aln_01
        andcc   %i0, 8, %o3
        brz,pn  %o3, .bc_aln_000
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_001
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_aln_01:
        brnz,pn %o3, .bc_aln_011
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_010
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_1:
        andcc   %i0, 16, %o3
        brnz,pn %o3, .bc_aln_11
        andcc   %i0, 8, %o3
        brnz,pn %o3, .bc_aln_101
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read
        ba      .bc_aln_100
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_11:
        brz,pn  %o3, .bc_aln_110
        prefetch [%i0 + (3 * CACHE_LINE)], #one_read

.bc_aln_111:
! Alignment off by 8 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        add     %i0, 8, %i0
        sub     %i2, 8, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_111_loop:
        ldda    [%i0]ASI_BLK_P,%d16             ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d2
        fmovd   %d18, %d4
        fmovd   %d20, %d6
        fmovd   %d22, %d8
        fmovd   %d24, %d10
        fmovd   %d26, %d12
        fmovd   %d28, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d30, %d0
        bgt,pt  %ncc, .bc_aln_111_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        ba      .bc_remain_stuff
        add     %i1, 8, %i1
        ! END OF aln_111

.bc_aln_110:
! Alignment off by 16 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        add     %i0, 16, %i0
        sub     %i2, 16, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_110_loop:
        ldda    [%i0]ASI_BLK_P,%d16             ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d4
        fmovd   %d18, %d6
        fmovd   %d20, %d8
        fmovd   %d22, %d10
        fmovd   %d24, %d12
        fmovd   %d26, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d28, %d0
        fmovd   %d30, %d2
        bgt,pt  %ncc, .bc_aln_110_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        ba      .bc_remain_stuff
        add     %i1, 16, %i1
        ! END OF aln_110

.bc_aln_101:
! Alignment off by 24 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        add     %i0, 24, %i0
        sub     %i2, 24, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_101_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d6
        fmovd   %d18, %d8
        fmovd   %d20, %d10
        fmovd   %d22, %d12
        fmovd   %d24, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d26, %d0
        fmovd   %d28, %d2
        fmovd   %d30, %d4
        bgt,pt  %ncc, .bc_aln_101_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        ba      .bc_remain_stuff
        add     %i1, 24, %i1
        ! END OF aln_101

.bc_aln_100:
! Alignment off by 32 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16],%d4
        ldd     [%i0+24],%d6
        add     %i0, 32, %i0
        sub     %i2, 32, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_100_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d8
        fmovd   %d18, %d10
        fmovd   %d20, %d12
        fmovd   %d22, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d24, %d0
        fmovd   %d26, %d2
        fmovd   %d28, %d4
        fmovd   %d30, %d6
        bgt,pt  %ncc, .bc_aln_100_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        ba      .bc_remain_stuff
        add     %i1, 32, %i1
        ! END OF aln_100

.bc_aln_011:
! Alignment off by 40 bytes
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        ldd     [%i0+24], %d6
        ldd     [%i0+32], %d8
        add     %i0, 40, %i0
        sub     %i2, 40, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_011_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d10
        fmovd   %d18, %d12
        fmovd   %d20, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d22, %d0
        fmovd   %d24, %d2
        fmovd   %d26, %d4
        fmovd   %d28, %d6
        fmovd   %d30, %d8
        bgt,pt  %ncc, .bc_aln_011_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        std     %d8, [%i1+32]
        ba      .bc_remain_stuff
        add     %i1, 40, %i1
        ! END OF aln_011

.bc_aln_010:
! Alignment off by 48 bytes
        ldd     [%i0], %d0
        ldd     [%i0+8], %d2
        ldd     [%i0+16], %d4
        ldd     [%i0+24], %d6
        ldd     [%i0+32], %d8
        ldd     [%i0+40], %d10
        add     %i0, 48, %i0
        sub     %i2, 48, %i2
        andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
        and     %i2, 0x7f, %i2          ! residue bytes in %i2
        sub     %i1, %i0, %i1
.bc_aln_010_loop:
        ldda    [%i0]ASI_BLK_P,%d16     ! block load
        subcc   %o3, 64, %o3
        fmovd   %d16, %d12
        fmovd   %d18, %d14
        stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
        stda    %d0,[%i0+%i1]ASI_BLK_P
        add     %i0, 64, %i0
        fmovd   %d20, %d0
        fmovd   %d22, %d2
        fmovd   %d24, %d4
        fmovd   %d26, %d6
        fmovd   %d28, %d8
        fmovd   %d30, %d10
        bgt,pt  %ncc, .bc_aln_010_loop
        prefetch [%i0 + (4 * CACHE_LINE)], #one_read
        add     %i1, %i0, %i1

        std     %d0, [%i1]
        std     %d2, [%i1+8]
        std     %d4, [%i1+16]
        std     %d6, [%i1+24]
        std     %d8, [%i1+32]
        std     %d10, [%i1+40]
        ba      .bc_remain_stuff
        add     %i1, 48, %i1
        ! END OF aln_010

.bc_aln_001:
1443 ! Alignment off by 56 bytes
1444         ldd     [%i0], %d0
1445         ldd     [%i0+8], %d2
1446         ldd     [%i0+16], %d4
1447         ldd     [%i0+24], %d6
1448         ldd     [%i0+32], %d8
1449         ldd     [%i0+40], %d10
1450         ldd     [%i0+48], %d12
1451         add     %i0, 56, %i0
1452         sub     %i2, 56, %i2
1453         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
1454         and     %i2, 0x7f, %i2          ! residue bytes in %i2
1455         sub     %i1, %i0, %i1
1456 .bc_aln_001_loop:
1457         ldda    [%i0]ASI_BLK_P,%d16     ! block load
1458         subcc   %o3, 64, %o3
1459         fmovd   %d16, %d14
1460         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1461         stda    %d0,[%i0+%i1]ASI_BLK_P
1462         add     %i0, 64, %i0
1463         fmovd   %d18, %d0
1464         fmovd   %d20, %d2
1465         fmovd   %d22, %d4
1466         fmovd   %d24, %d6
1467         fmovd   %d26, %d8
1468         fmovd   %d28, %d10
1469         fmovd   %d30, %d12
1470         bgt,pt  %ncc, .bc_aln_001_loop
1471         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1472         add     %i1, %i0, %i1
1473 
1474         std     %d0, [%i1]
1475         std     %d2, [%i1+8]
1476         std     %d4, [%i1+16]
1477         std     %d6, [%i1+24]
1478         std     %d8, [%i1+32]
1479         std     %d10, [%i1+40]
1480         std     %d12, [%i1+48]
1481         ba      .bc_remain_stuff
1482         add     %i1, 56, %i1
1483         ! END OF aln_001
1484 
1485 .bc_aln_000:
1486         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1487         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
1488         and     %i2, 0x7f, %i2          ! residue bytes in %i2
1489         sub     %i1, %i0, %i1
1490 .bc_aln_000_loop:
1491         ldda    [%i0]ASI_BLK_P,%d0
1492         subcc   %o3, 64, %o3
1493         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
1494         stda    %d0,[%i0+%i1]ASI_BLK_P
1495         add     %i0, 64, %i0
1496         bgt,pt  %ncc, .bc_aln_000_loop
1497         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1498         add     %i1, %i0, %i1
1499 
1500         ! END OF aln_000
1501 
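/*
 * The .bc_aln_* loops above all share one software-pipelined pattern.
 * A rough C-like sketch (illustrative only, not part of the build):
 * k = offset/8 doublewords are preloaded into %d0..%d(2k-2), and the
 * destination pointer has been biased by -src so a single induction
 * variable serves both pointers:
 *
 *	while (large blocks remain) {
 *		new[0..7] = 64-byte block load from src;
 *		out[0..7] = carry[0..k-1] ++ new[0..7-k];  ! fmovd shuffles
 *		block-initializing-store out[] at src + (dst - src);
 *		carry[0..k-1] = new[8-k..7];
 *		src += 64;
 *	}
 *	store the k carried doublewords, rebuild dst, and fall into
 *	.bc_remain_stuff for the residue.
 */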
1502 .bc_remain_stuff:
1503         subcc   %i2, 31, %i2            ! adjust length to allow cc test
1504         ble,pt  %ncc, .bc_aln_31
1505         nop
1506 .bc_aln_32:
1507         ldx     [%i0], %o4              ! move 32 bytes
1508         subcc   %i2, 32, %i2            ! decrement length count by 32
1509         stx     %o4, [%i1]
1510         ldx     [%i0+8], %o4
1511         stx     %o4, [%i1+8]
1512         ldx     [%i0+16], %o4
1513         add     %i0, 32, %i0            ! increase src ptr by 32
1514         stx     %o4, [%i1+16]
1515         ldx     [%i0-8], %o4
1516         add     %i1, 32, %i1            ! increase dst ptr by 32
1517         bgu,pt  %ncc, .bc_aln_32        ! repeat if at least 32 bytes left
1518         stx     %o4, [%i1-8]
1519 .bc_aln_31:
1520         addcc   %i2, 24, %i2            ! adjust count to be off by 7
1521         ble,pt  %ncc, .bc_aln_7         ! skip if 7 or fewer bytes left
1522         nop                             !
1523 .bc_aln_15:
1524         ldx     [%i0], %o4              ! move 8 bytes
1525         add     %i0, 8, %i0             ! increase src ptr by 8
1526         subcc   %i2, 8, %i2             ! decrease count by 8
1527         add     %i1, 8, %i1             ! increase dst ptr by 8
1528         bgu,pt  %ncc, .bc_aln_15
1529         stx     %o4, [%i1-8]            !
1530 .bc_aln_7:
1531         addcc   %i2, 7, %i2             ! finish adjustment of remaining count
1532         bz,pt   %ncc, .bc_exit          ! exit if finished
1533         cmp     %i2, 4
1534         blt,pt  %ncc, .bc_unaln3x       ! skip if less than 4 bytes left
1535         nop                             !
1536         ld      [%i0], %o4              ! move 4 bytes
1537         add     %i0, 4, %i0             ! increase src ptr by 4
1538         add     %i1, 4, %i1             ! increase dst ptr by 4
1539         subcc   %i2, 4, %i2             ! decrease count by 4
1540         bnz     .bc_unaln3x
1541         stw     %o4, [%i1-4]
1542         ba      .bc_exit
1543         nop
1544 
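/*
 * The counting trick in .bc_remain_stuff above, sketched in C
 * (illustrative only; copy32()/copy8() are shorthand, not real
 * routines): the count is kept biased by (chunk size - 1) so a
 * single signed test decides whether a full chunk remains.
 *
 *	n -= 31;		! ">0" now means ">= 32 bytes left"
 *	while (n > 0) { copy32(); n -= 32; }
 *	n += 24;		! rebias: ">0" now means ">= 8 left"
 *	while (n > 0) { copy8(); n -= 8; }
 *	n += 7;			! n is now the exact residue, 0..7
 */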
1545         ! destination alignment code
1546 .bc_big_d1:
1547         ldub    [%i0], %o4              ! move a byte
1548         add     %i0, 1, %i0
1549         stb     %o4, [%i1]
1550         add     %i1, 1, %i1
1551         andcc   %i1, 2, %o3
1552         bz,pt   %ncc, .bc_big_d2f
1553         sub     %i2, 1, %i2
1554 .bc_big_d2:
1555         ldub    [%i0], %o4              ! move a half-word (src align unknown)
1556         ldub    [%i0+1], %o3
1557         add     %i0, 2, %i0
1558         sll     %o4, 8, %o4             ! position
1559         or      %o4, %o3, %o4           ! merge
1560         sth     %o4, [%i1]
1561         add     %i1, 2, %i1
1562         andcc   %i1, 4, %o3
1563         bz,pt   %ncc, .bc_big_d4f
1564         sub     %i2, 2, %i2
1565 .bc_big_d4:
1566         ldub    [%i0], %o4              ! move a word (src align unknown)
1567         ldub    [%i0+1], %o3
1568         sll     %o4, 24, %o4            ! position
1569         sll     %o3, 16, %o3            ! position
1570         or      %o4, %o3, %o3           ! merge
1571         ldub    [%i0+2], %o4
1572         sll     %o4, 8, %o4             ! position
1573         or      %o4, %o3, %o3           ! merge
1574         ldub    [%i0+3], %o4
1575         or      %o4, %o3, %o4           ! merge
1576         stw     %o4,[%i1]               ! store four bytes
1577         add     %i0, 4, %i0             ! adjust src by 4
1578         add     %i1, 4, %i1             ! adjust dest by 4
1579         ba      .bc_big_d4f
1580         sub     %i2, 4, %i2             ! adjust count by 4
1581 
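/*
 * .bc_big_d4 above assembles an aligned word from four unaligned
 * source bytes; in C terms (illustrative only; SPARC is big-endian):
 *
 *	uint32_t w = ((uint32_t)src[0] << 24) | ((uint32_t)src[1] << 16) |
 *	    ((uint32_t)src[2] << 8) | src[3];
 *	*(uint32_t *)dst = w;		! dst is 4-byte aligned here
 */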
1582 
1583         ! Dst is on 8 byte boundary; src is not;
1584 .bc_big_unal8:
1585         andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
1586         bz      %ncc, .bc_unalnsrc
1587         sub     %o3, 64, %o3            ! %o3 will be multiple of 8
1588         neg     %o3                     ! bytes until dest is 64 byte aligned
1589         sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
1590         ! Move bytes according to source alignment
1591         andcc   %i0, 0x1, %o4
1592         bnz     %ncc, .bc_unalnbyte     ! check for byte alignment
1593         nop
1594         andcc   %i0, 2, %o4             ! check for half word alignment
1595         bnz     %ncc, .bc_unalnhalf
1596         nop
1597         ! Src is word aligned, move bytes until dest 64 byte aligned
1598 .bc_unalnword:
1599         ld      [%i0], %o4              ! load 4 bytes
1600         stw     %o4, [%i1]              ! and store 4 bytes
1601         ld      [%i0+4], %o4            ! load 4 bytes
1602         add     %i0, 8, %i0             ! increase src ptr by 8
1603         stw     %o4, [%i1+4]            ! and store 4 bytes
1604         subcc   %o3, 8, %o3             ! decrease count by 8
1605         bnz     %ncc, .bc_unalnword
1606         add     %i1, 8, %i1             ! increase dst ptr by 8
1607         ba      .bc_unalnsrc
1608         nop
1609 
1610         ! Src is half-word aligned, move bytes until dest 64 byte aligned
1611 .bc_unalnhalf:
1612         lduh    [%i0], %o4              ! load 2 bytes
1613         sllx    %o4, 32, %i3            ! shift left
1614         lduw    [%i0+2], %o4
1615         or      %o4, %i3, %i3
1616         sllx    %i3, 16, %i3
1617         lduh    [%i0+6], %o4
1618         or      %o4, %i3, %i3
1619         stx     %i3, [%i1]
1620         add     %i0, 8, %i0
1621         subcc   %o3, 8, %o3
1622         bnz     %ncc, .bc_unalnhalf
1623         add     %i1, 8, %i1
1624         ba      .bc_unalnsrc
1625         nop
1626 
1627         ! Src is Byte aligned, move bytes until dest 64 byte aligned
1628 .bc_unalnbyte:
1629         sub     %i1, %i0, %i1           ! share pointer advance
1630 .bc_unalnbyte_loop:
1631         ldub    [%i0], %o4
1632         sllx    %o4, 56, %i3
1633         lduh    [%i0+1], %o4
1634         sllx    %o4, 40, %o4
1635         or      %o4, %i3, %i3
1636         lduh    [%i0+3], %o4
1637         sllx    %o4, 24, %o4
1638         or      %o4, %i3, %i3
1639         lduh    [%i0+5], %o4
1640         sllx    %o4, 8, %o4
1641         or      %o4, %i3, %i3
1642         ldub    [%i0+7], %o4
1643         or      %o4, %i3, %i3
1644         stx     %i3, [%i1+%i0]
1645         subcc   %o3, 8, %o3
1646         bnz     %ncc, .bc_unalnbyte_loop
1647         add     %i0, 8, %i0
1648         add     %i1,%i0, %i1            ! restore pointer
1649 
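/*
 * A C-like reading of .bc_unalnbyte_loop above (illustrative only;
 * ldh() is shorthand for a 2-byte load): eight bytes are gathered as
 * byte + three halfwords + byte so that every load is naturally
 * aligned, and only the difference d = dst - src is carried so one
 * induction variable advances both pointers:
 *
 *	x  = (uint64_t)src[0] << 56;
 *	x |= (uint64_t)ldh(src + 1) << 40;	! src + 1 is even
 *	x |= (uint64_t)ldh(src + 3) << 24;
 *	x |= (uint64_t)ldh(src + 5) << 8;
 *	x |= src[7];
 *	*(uint64_t *)(src + d) = x;		! the 8-byte-aligned dst
 */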
1650         ! Destination is now block (64-byte) aligned; src is not 8-byte aligned
1651 .bc_unalnsrc:
1652         andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
1653         and     %i2, 0x3f, %i2          ! residue bytes in %i2
1654         add     %i2, 64, %i2            ! Ensure we don't load beyond
1655         sub     %i3, 64, %i3            ! end of source buffer
1656 
1657         andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
1658         prefetch [%o4 + (3 * CACHE_LINE)], #one_read
1659         alignaddr %i0, %g0, %g0         ! generate %gsr
1660         add     %i0, %i3, %i0           ! advance %i0 to after blocks
1661         !
1662         ! Determine source alignment to correct 8 byte offset
1663         andcc   %i0, 0x20, %o3
1664         brnz,pn %o3, .bc_unaln_1
1665         andcc   %i0, 0x10, %o3
1666         brnz,pn %o3, .bc_unaln_01
1667         andcc   %i0, 0x08, %o3
1668         brz,a   %o3, .bc_unaln_000
1669         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1670         ba      .bc_unaln_001
1671         nop
1672 .bc_unaln_01:
1673         brnz,a  %o3, .bc_unaln_011
1674         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1675         ba      .bc_unaln_010
1676         nop
1677 .bc_unaln_1:
1678         brnz,pn %o3, .bc_unaln_11
1679         andcc   %i0, 0x08, %o3
1680         brnz,a  %o3, .bc_unaln_101
1681         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1682         ba      .bc_unaln_100
1683         nop
1684 .bc_unaln_11:
1685         brz,pn  %o3, .bc_unaln_110
1686         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1687 
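/*
 * Each .bc_unaln_* loop below is the classic VIS faligndata pipeline;
 * roughly (illustrative only), with %gsr already set by alignaddr to
 * the source offset within a doubleword and "prev" holding the
 * doublewords preloaded before the loop:
 *
 *	while (blocks remain) {
 *		next[0..7] = 64-byte block load from the aligned address;
 *		out[i] = faligndata(in[i], in[i+1]);	! i = 0..7; each
 *						! output straddles two inputs
 *		block store out[0..7] at dst; dst += 64;
 *		carry the tail of next[] into prev[];
 *	}
 */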
1688 .bc_unaln_111:
1689         ldd     [%o4+56], %d14
1690 .bc_unaln_111_loop:
1691         add     %o4, 64, %o4
1692         ldda    [%o4]ASI_BLK_P, %d16
1693         faligndata %d14, %d16, %d48
1694         faligndata %d16, %d18, %d50
1695         faligndata %d18, %d20, %d52
1696         faligndata %d20, %d22, %d54
1697         faligndata %d22, %d24, %d56
1698         faligndata %d24, %d26, %d58
1699         faligndata %d26, %d28, %d60
1700         faligndata %d28, %d30, %d62
1701         fmovd   %d30, %d14
1702         stda    %d48, [%i1]ASI_BLK_P
1703         subcc   %i3, 64, %i3
1704         add     %i1, 64, %i1
1705         bgu,pt  %ncc, .bc_unaln_111_loop
1706         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1707         ba      .bc_unaln_done
1708         nop
1709 
1710 .bc_unaln_110:
1711         ldd     [%o4+48], %d12
1712         ldd     [%o4+56], %d14
1713 .bc_unaln_110_loop:
1714         add     %o4, 64, %o4
1715         ldda    [%o4]ASI_BLK_P, %d16
1716         faligndata %d12, %d14, %d48
1717         faligndata %d14, %d16, %d50
1718         faligndata %d16, %d18, %d52
1719         faligndata %d18, %d20, %d54
1720         faligndata %d20, %d22, %d56
1721         faligndata %d22, %d24, %d58
1722         faligndata %d24, %d26, %d60
1723         faligndata %d26, %d28, %d62
1724         fmovd   %d28, %d12
1725         fmovd   %d30, %d14
1726         stda    %d48, [%i1]ASI_BLK_P
1727         subcc   %i3, 64, %i3
1728         add     %i1, 64, %i1
1729         bgu,pt  %ncc, .bc_unaln_110_loop
1730         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1731         ba      .bc_unaln_done
1732         nop
1733 
1734 .bc_unaln_101:
1735         ldd     [%o4+40], %d10
1736         ldd     [%o4+48], %d12
1737         ldd     [%o4+56], %d14
1738 .bc_unaln_101_loop:
1739         add     %o4, 64, %o4
1740         ldda    [%o4]ASI_BLK_P, %d16
1741         faligndata %d10, %d12, %d48
1742         faligndata %d12, %d14, %d50
1743         faligndata %d14, %d16, %d52
1744         faligndata %d16, %d18, %d54
1745         faligndata %d18, %d20, %d56
1746         faligndata %d20, %d22, %d58
1747         faligndata %d22, %d24, %d60
1748         faligndata %d24, %d26, %d62
1749         fmovd   %d26, %d10
1750         fmovd   %d28, %d12
1751         fmovd   %d30, %d14
1752         stda    %d48, [%i1]ASI_BLK_P
1753         subcc   %i3, 64, %i3
1754         add     %i1, 64, %i1
1755         bgu,pt  %ncc, .bc_unaln_101_loop
1756         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1757         ba      .bc_unaln_done
1758         nop
1759 
1760 .bc_unaln_100:
1761         ldd     [%o4+32], %d8
1762         ldd     [%o4+40], %d10
1763         ldd     [%o4+48], %d12
1764         ldd     [%o4+56], %d14
1765 .bc_unaln_100_loop:
1766         add     %o4, 64, %o4
1767         ldda    [%o4]ASI_BLK_P, %d16
1768         faligndata %d8, %d10, %d48
1769         faligndata %d10, %d12, %d50
1770         faligndata %d12, %d14, %d52
1771         faligndata %d14, %d16, %d54
1772         faligndata %d16, %d18, %d56
1773         faligndata %d18, %d20, %d58
1774         faligndata %d20, %d22, %d60
1775         faligndata %d22, %d24, %d62
1776         fmovd   %d24, %d8
1777         fmovd   %d26, %d10
1778         fmovd   %d28, %d12
1779         fmovd   %d30, %d14
1780         stda    %d48, [%i1]ASI_BLK_P
1781         subcc   %i3, 64, %i3
1782         add     %i1, 64, %i1
1783         bgu,pt  %ncc, .bc_unaln_100_loop
1784         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1785         ba      .bc_unaln_done
1786         nop
1787 
1788 .bc_unaln_011:
1789         ldd     [%o4+24], %d6
1790         ldd     [%o4+32], %d8
1791         ldd     [%o4+40], %d10
1792         ldd     [%o4+48], %d12
1793         ldd     [%o4+56], %d14
1794 .bc_unaln_011_loop:
1795         add     %o4, 64, %o4
1796         ldda    [%o4]ASI_BLK_P, %d16
1797         faligndata %d6, %d8, %d48
1798         faligndata %d8, %d10, %d50
1799         faligndata %d10, %d12, %d52
1800         faligndata %d12, %d14, %d54
1801         faligndata %d14, %d16, %d56
1802         faligndata %d16, %d18, %d58
1803         faligndata %d18, %d20, %d60
1804         faligndata %d20, %d22, %d62
1805         fmovd   %d22, %d6
1806         fmovd   %d24, %d8
1807         fmovd   %d26, %d10
1808         fmovd   %d28, %d12
1809         fmovd   %d30, %d14
1810         stda    %d48, [%i1]ASI_BLK_P
1811         subcc   %i3, 64, %i3
1812         add     %i1, 64, %i1
1813         bgu,pt  %ncc, .bc_unaln_011_loop
1814         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1815         ba      .bc_unaln_done
1816         nop
1817 
1818 .bc_unaln_010:
1819         ldd     [%o4+16], %d4
1820         ldd     [%o4+24], %d6
1821         ldd     [%o4+32], %d8
1822         ldd     [%o4+40], %d10
1823         ldd     [%o4+48], %d12
1824         ldd     [%o4+56], %d14
1825 .bc_unaln_010_loop:
1826         add     %o4, 64, %o4
1827         ldda    [%o4]ASI_BLK_P, %d16
1828         faligndata %d4, %d6, %d48
1829         faligndata %d6, %d8, %d50
1830         faligndata %d8, %d10, %d52
1831         faligndata %d10, %d12, %d54
1832         faligndata %d12, %d14, %d56
1833         faligndata %d14, %d16, %d58
1834         faligndata %d16, %d18, %d60
1835         faligndata %d18, %d20, %d62
1836         fmovd   %d20, %d4
1837         fmovd   %d22, %d6
1838         fmovd   %d24, %d8
1839         fmovd   %d26, %d10
1840         fmovd   %d28, %d12
1841         fmovd   %d30, %d14
1842         stda    %d48, [%i1]ASI_BLK_P
1843         subcc   %i3, 64, %i3
1844         add     %i1, 64, %i1
1845         bgu,pt  %ncc, .bc_unaln_010_loop
1846         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1847         ba      .bc_unaln_done
1848         nop
1849 
1850 .bc_unaln_001:
1851         ldd     [%o4+8], %d2
1852         ldd     [%o4+16], %d4
1853         ldd     [%o4+24], %d6
1854         ldd     [%o4+32], %d8
1855         ldd     [%o4+40], %d10
1856         ldd     [%o4+48], %d12
1857         ldd     [%o4+56], %d14
1858 .bc_unaln_001_loop:
1859         add     %o4, 64, %o4
1860         ldda    [%o4]ASI_BLK_P, %d16
1861         faligndata %d2, %d4, %d48
1862         faligndata %d4, %d6, %d50
1863         faligndata %d6, %d8, %d52
1864         faligndata %d8, %d10, %d54
1865         faligndata %d10, %d12, %d56
1866         faligndata %d12, %d14, %d58
1867         faligndata %d14, %d16, %d60
1868         faligndata %d16, %d18, %d62
1869         fmovd   %d18, %d2
1870         fmovd   %d20, %d4
1871         fmovd   %d22, %d6
1872         fmovd   %d24, %d8
1873         fmovd   %d26, %d10
1874         fmovd   %d28, %d12
1875         fmovd   %d30, %d14
1876         stda    %d48, [%i1]ASI_BLK_P
1877         subcc   %i3, 64, %i3
1878         add     %i1, 64, %i1
1879         bgu,pt  %ncc, .bc_unaln_001_loop
1880         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1881         ba      .bc_unaln_done
1882         nop
1883 
1884 .bc_unaln_000:
1885         ldda    [%o4]ASI_BLK_P, %d0
1886 .bc_unaln_000_loop:
1887         add     %o4, 64, %o4
1888         ldda    [%o4]ASI_BLK_P, %d16
1889         faligndata %d0, %d2, %d48
1890         faligndata %d2, %d4, %d50
1891         faligndata %d4, %d6, %d52
1892         faligndata %d6, %d8, %d54
1893         faligndata %d8, %d10, %d56
1894         faligndata %d10, %d12, %d58
1895         faligndata %d12, %d14, %d60
1896         faligndata %d14, %d16, %d62
1897         fmovd   %d16, %d0
1898         fmovd   %d18, %d2
1899         fmovd   %d20, %d4
1900         fmovd   %d22, %d6
1901         fmovd   %d24, %d8
1902         fmovd   %d26, %d10
1903         fmovd   %d28, %d12
1904         fmovd   %d30, %d14
1905         stda    %d48, [%i1]ASI_BLK_P
1906         subcc   %i3, 64, %i3
1907         add     %i1, 64, %i1
1908         bgu,pt  %ncc, .bc_unaln_000_loop
1909         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1910 
1911 .bc_unaln_done:
1912         ! Handle trailing bytes, 64 to 127
1913         ! Dest long word aligned, Src not long word aligned
1914         cmp     %i2, 15
1915         bleu    %ncc, .bc_unaln_short
1916 
1917         andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
1918         and     %i2, 0x7, %i2           ! residue bytes in %i2
1919         add     %i2, 8, %i2
1920         sub     %i3, 8, %i3             ! ensure we don't load past end of src
1921         andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
1922         add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
1923         ldd     [%o4], %d0              ! fetch partial word
1924 .bc_unaln_by8:
1925         ldd     [%o4+8], %d2
1926         add     %o4, 8, %o4
1927         faligndata %d0, %d2, %d16
1928         subcc   %i3, 8, %i3
1929         std     %d16, [%i1]
1930         fmovd   %d2, %d0
1931         bgu,pt  %ncc, .bc_unaln_by8
1932         add     %i1, 8, %i1
1933 
1934 .bc_unaln_short:
1935         cmp     %i2, 8
1936         blt,pt  %ncc, .bc_unalnfin
1937         nop
1938         ldub    [%i0], %o4
1939         sll     %o4, 24, %o3
1940         ldub    [%i0+1], %o4
1941         sll     %o4, 16, %o4
1942         or      %o4, %o3, %o3
1943         ldub    [%i0+2], %o4
1944         sll     %o4, 8, %o4
1945         or      %o4, %o3, %o3
1946         ldub    [%i0+3], %o4
1947         or      %o4, %o3, %o3
1948         stw     %o3, [%i1]
1949         ldub    [%i0+4], %o4
1950         sll     %o4, 24, %o3
1951         ldub    [%i0+5], %o4
1952         sll     %o4, 16, %o4
1953         or      %o4, %o3, %o3
1954         ldub    [%i0+6], %o4
1955         sll     %o4, 8, %o4
1956         or      %o4, %o3, %o3
1957         ldub    [%i0+7], %o4
1958         or      %o4, %o3, %o3
1959         stw     %o3, [%i1+4]
1960         add     %i0, 8, %i0
1961         add     %i1, 8, %i1
1962         sub     %i2, 8, %i2
1963 .bc_unalnfin:
1964         cmp     %i2, 4
1965         blt,pt  %ncc, .bc_unalnz
1966         tst     %i2
1967         ldub    [%i0], %o3              ! read byte
1968         subcc   %i2, 4, %i2             ! reduce count by 4
1969         sll     %o3, 24, %o3            ! position
1970         ldub    [%i0+1], %o4
1971         sll     %o4, 16, %o4            ! position
1972         or      %o4, %o3, %o3           ! merge
1973         ldub    [%i0+2], %o4
1974         sll     %o4, 8, %o4             ! position
1975         or      %o4, %o3, %o3           ! merge
1976         add     %i1, 4, %i1             ! advance dst by 4
1977         ldub    [%i0+3], %o4
1978         add     %i0, 4, %i0             ! advance src by 4
1979         or      %o4, %o3, %o4           ! merge
1980         bnz,pt  %ncc, .bc_unaln3x
1981         stw     %o4, [%i1-4]
1982         ba      .bc_exit
1983         nop
1984 .bc_unalnz:
1985         bz,pt   %ncc, .bc_exit
1986 .bc_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
1987         subcc   %i2, 1, %i2             ! reduce count for cc test
1988         ldub    [%i0], %o4              ! load one byte
1989         bz,pt   %ncc, .bc_exit
1990         stb     %o4, [%i1]              ! store one byte
1991         ldub    [%i0+1], %o4            ! load second byte
1992         subcc   %i2, 1, %i2
1993         bz,pt   %ncc, .bc_exit
1994         stb     %o4, [%i1+1]            ! store second byte
1995         ldub    [%i0+2], %o4            ! load third byte
1996         stb     %o4, [%i1+2]            ! store third byte
1997 .bc_exit:
1998         wr      %l5, %g0, %gsr          ! restore %gsr
1999         brnz    %g5, .bc_fp_restore
2000         and     %o5, COPY_FLAGS, %l1    ! save flags in %l1
2001         FZERO
2002         wr      %g5, %g0, %fprs
2003         ba,pt   %ncc, .bc_ex2
2004         nop
2005 .bc_fp_restore:
2006         BLD_FP_FROMSTACK(%o4)
2007 .bc_ex2:
2008         ldn     [THREAD_REG + T_LWP], %o2
2009         brnz,pt %o2, 1f
2010         nop
2011 
2012         ldsb    [THREAD_REG + T_PREEMPT], %l0
2013         deccc   %l0
2014         bnz,pn  %ncc, 1f
2015         stb     %l0, [THREAD_REG + T_PREEMPT]
2016 
2017         ! Check for a kernel preemption request
2018         ldn     [THREAD_REG + T_CPU], %l0
2019         ldub    [%l0 + CPU_KPRUNRUN], %l0
2020         brnz,a,pt       %l0, 1f ! Need to call kpreempt?
2021         or      %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
2022 1:
2023         btst    LOFAULT_SET, %l1
2024         bz,pn   %icc, 3f
2025         andncc  %o5, COPY_FLAGS, %o5
2026         ! Here via bcopy. Check to see if the handler was NULL.
2027         ! If so, just return quietly. Otherwise, reset the
2028         ! handler and return.
2029         bz,pn %ncc, 2f
2030         nop
2031         membar  #Sync
2032         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2033 2:
2034         btst    KPREEMPT_FLAG, %l1
2035         bz,pt   %icc, 3f
2036         nop
2037         call    kpreempt
2038         rdpr    %pil, %o0               ! pass %pil
2039 3:
2040         ret
2041         restore %g0, 0, %o0
2042         
2043         SET_SIZE(bcopy_more)
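/*
 * A C-like reading of the .bc_exit path above (illustrative only):
 *
 *	restore %gsr; restore the FP registers from the stack, or
 *	zero them and restore %fprs, depending on what was saved;
 *	flags = %o5 & COPY_FLAGS;
 *	if (curthread->t_lwp == NULL &&
 *	    --curthread->t_preempt == 0 && CPU->cpu_kprunrun)
 *		flags |= KPREEMPT_FLAG;
 *	if (flags & LOFAULT_SET) {		! came in via kcopy()
 *		if ((%o5 & ~COPY_FLAGS) != 0) {	! old handler non-NULL
 *			membar #Sync;
 *			curthread->t_lofault = old handler;
 *		}
 *		if (flags & KPREEMPT_FLAG)
 *			kpreempt(%pil);
 *	}
 *	return (0);
 */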
2044 
2045 
2046 #else   /* NIAGARA_IMPL */
2047         save    %sp, -SA(MINFRAME), %sp
2048         clr     %o5                     ! flag LOFAULT_SET is not set for bcopy
2049 .do_copy:
2050         cmp     %i2, 12                 ! for small counts
2051         blu     %ncc, .bytecp           ! just copy bytes
2052         .empty
2053 
2054         cmp     %i2, 128                ! for less than 128 bytes
2055         blu,pn  %ncc, .bcb_punt         ! no block st/quad ld
2056         nop
2057 
2058         set     use_hw_bcopy, %o2
2059         ld      [%o2], %o2
2060         brz,pn  %o2, .bcb_punt
2061         nop
2062 
2063         subcc   %i1, %i0, %i3
2064         bneg,a,pn %ncc, 1f
2065         neg     %i3
2066 1:
2067         /*
2068          * Compare against 256, since we should be checking block addresses
2069          * and (dest & ~63) - (src & ~63) can be 3 blocks even if
2070          * src = dest + (64 * 3) + 63.
2071          */
2072         cmp     %i3, 256
2073         blu,pn  %ncc, .bcb_punt
2074         nop
2075 
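/*
 * In C terms the distance check above is simply (illustrative only):
 *
 *	ptrdiff_t d = dst - src;
 *	if (d < 0)
 *		d = -d;
 *	if (d < 256)		! can't guarantee 2 blocks of separation
 *		goto .bcb_punt;	! once both addresses are block aligned
 */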
2076         /*
2077  * Copies that reach here have at least 2 blocks of data to copy.
2078          */
2079 .do_blockcopy:
2080         ! Swap src/dst since the code below is memcpy code
2081         ! and memcpy/bcopy have different calling sequences
2082         mov     %i1, %i5
2083         mov     %i0, %i1
2084         mov     %i5, %i0
2085 
2086         ! Block (64 bytes) align the destination.
2087         andcc   %i0, 0x3f, %i3          ! is dst aligned on a 64-byte boundary
2088         bz      %xcc, .chksrc           ! dst is already block aligned
2089         sub     %i3, 0x40, %i3
2090         neg     %i3                     ! bytes until dst is 64-byte aligned
2091         sub     %i2, %i3, %i2           ! update i2 with new count
2092 
2093         ! Based on source and destination alignment do
2094         ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2095 
2096         ! Is dst & src 8B aligned
2097         or      %i0, %i1, %o2
2098         andcc   %o2, 0x7, %g0
2099         bz      %ncc, .alewdcp
2100         nop
2101 
2102         ! Is dst & src 4B aligned
2103         andcc   %o2, 0x3, %g0
2104         bz      %ncc, .alwdcp
2105         nop
2106 
2107         ! Is dst & src 2B aligned
2108         andcc   %o2, 0x1, %g0
2109         bz      %ncc, .alhlfwdcp
2110         nop
2111 
2112         ! 1B aligned
2113 1:      ldub    [%i1], %o2
2114         stb     %o2, [%i0]
2115         inc     %i1
2116         deccc   %i3
2117         bgu,pt  %ncc, 1b
2118         inc     %i0
2119 
2120         ba      .chksrc
2121         nop
2122 
2123         ! dst & src 4B aligned
2124 .alwdcp:
2125         ld      [%i1], %o2
2126         st      %o2, [%i0]
2127         add     %i1, 0x4, %i1
2128         subcc   %i3, 0x4, %i3
2129         bgu,pt  %ncc, .alwdcp
2130         add     %i0, 0x4, %i0
2131 
2132         ba      .chksrc
2133         nop
2134 
2135         ! dst & src 2B aligned
2136 .alhlfwdcp:
2137         lduh    [%i1], %o2
2138         stuh    %o2, [%i0]
2139         add     %i1, 0x2, %i1
2140         subcc   %i3, 0x2, %i3
2141         bgu,pt  %ncc, .alhlfwdcp
2142         add     %i0, 0x2, %i0
2143 
2144         ba      .chksrc
2145         nop
2146 
2147         ! dst & src 8B aligned
2148 .alewdcp:
2149         ldx     [%i1], %o2
2150         stx     %o2, [%i0]
2151         add     %i1, 0x8, %i1
2152         subcc   %i3, 0x8, %i3
2153         bgu,pt  %ncc, .alewdcp
2154         add     %i0, 0x8, %i0
2155 
2156         ! Now Destination is block (64 bytes) aligned
2157 .chksrc:
2158         andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
2159         sub     %i2, %i3, %i2           ! Residue bytes in %i2
2160 
2161         mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2162 
2163         andcc   %i1, 0xf, %o2           ! is src quadword aligned
2164         bz,pn   %xcc, .blkcpy           ! src offset in %o2
2165         nop
2166         cmp     %o2, 0x8
2167         bg      .cpy_upper_double
2168         nop
2169         bl      .cpy_lower_double
2170         nop
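/*
 * The dispatch above classifies the source offset within a quadword,
 * off = src & 0xf (illustrative summary):
 *
 *	off == 0: .blkcpy		! quad loads need no merging
 *	off == 8: fall through		! doubleword aligned, reorder only
 *	off <  8: .cpy_lower_double	! shift left by off * 8 bits
 *	off >  8: .cpy_upper_double	! shift left by (off - 8) * 8 bits
 */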
2171 
2172         ! Falls through when source offset is equal to 8 i.e.
2173         ! source is double word aligned.
2174         ! In this case no shift/merge of data is required
2175         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
2176         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
2177         prefetch [%l0+0x0], #one_read
2178         ldda    [%i1+0x0]%asi, %l2
2179 loop0:
2180         ldda    [%i1+0x10]%asi, %l4
2181         prefetch [%l0+0x40], #one_read
2182 
2183         stxa    %l3, [%i0+0x0]%asi
2184         stxa    %l4, [%i0+0x8]%asi
2185 
2186         ldda    [%i1+0x20]%asi, %l2
2187         stxa    %l5, [%i0+0x10]%asi
2188         stxa    %l2, [%i0+0x18]%asi
2189 
2190         ldda    [%i1+0x30]%asi, %l4
2191         stxa    %l3, [%i0+0x20]%asi
2192         stxa    %l4, [%i0+0x28]%asi
2193 
2194         ldda    [%i1+0x40]%asi, %l2
2195         stxa    %l5, [%i0+0x30]%asi
2196         stxa    %l2, [%i0+0x38]%asi
2197 
2198         add     %l0, 0x40, %l0
2199         add     %i1, 0x40, %i1
2200         subcc   %i3, 0x40, %i3
2201         bgu,pt  %xcc, loop0
2202         add     %i0, 0x40, %i0
2203         ba      .blkdone
2204         add     %i1, %o2, %i1           ! increment the source by src offset
2205                                         ! the src offset was stored in %o2
2206 
2207 .cpy_lower_double:
2208         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
2209         sll     %o2, 3, %o0             ! %o0 left shift
2210         mov     0x40, %o1
2211         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
2212         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
2213         prefetch [%l0+0x0], #one_read
2214         ldda    [%i1+0x0]%asi, %l2      ! partial data in %l2 and %l3 has
2215                                         ! complete data
2216 loop1:
2217         ldda    [%i1+0x10]%asi, %l4     ! %l4 has partial data for this read.
2218         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
2219                                                         ! into %l2 and %l3
2220         prefetch [%l0+0x40], #one_read
2221         stxa    %l2, [%i0+0x0]%asi
2222         stxa    %l3, [%i0+0x8]%asi
2223 
2224         ldda    [%i1+0x20]%asi, %l2
2225         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
2226         stxa    %l4, [%i0+0x10]%asi                     ! %l4 from previous read
2227         stxa    %l5, [%i0+0x18]%asi                     ! into %l4 and %l5
2228 
2229         ! Repeat the same for next 32 bytes.
2230 
2231         ldda    [%i1+0x30]%asi, %l4
2232         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2233         stxa    %l2, [%i0+0x20]%asi
2234         stxa    %l3, [%i0+0x28]%asi
2235 
2236         ldda    [%i1+0x40]%asi, %l2
2237         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2238         stxa    %l4, [%i0+0x30]%asi
2239         stxa    %l5, [%i0+0x38]%asi
2240 
2241         add     %l0, 0x40, %l0
2242         add     %i1, 0x40, %i1
2243         subcc   %i3, 0x40, %i3
2244         bgu,pt  %xcc, loop1
2245         add     %i0, 0x40, %i0
2246         ba      .blkdone
2247         add     %i1, %o2, %i1           ! increment the source by src offset
2248                                         ! the src offset was stored in %o2
2249 
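/*
 * Conceptually, each ALIGN_DATA() in loop1 above and loop2 below
 * merges three doublewords into two using the shift counts computed
 * before the loop (illustrative only; l = left shift, r = 64 - l):
 *
 *	out0 = (a << l) | (b >> r);
 *	out1 = (b << l) | (c >> r);
 */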
2250 .cpy_upper_double:
2251         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
2252         mov     0x8, %o0
2253         sub     %o2, %o0, %o0
2254         sll     %o0, 3, %o0             ! %o0 left shift
2255         mov     0x40, %o1
2256         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
2257         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
2258         prefetch [%l0+0x0], #one_read
2259         ldda    [%i1+0x0]%asi, %l2      ! partial data in %l3 for this read and
2260                                         ! no data in %l2
2261 loop2:
2262         ldda    [%i1+0x10]%asi, %l4     ! %l4 has complete data and %l5 has
2263                                         ! partial
2264         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
2265                                                         ! into %l3 and %l4
2266         prefetch [%l0+0x40], #one_read
2267         stxa    %l3, [%i0+0x0]%asi
2268         stxa    %l4, [%i0+0x8]%asi
2269 
2270         ldda    [%i1+0x20]%asi, %l2
2271         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
2272         stxa    %l5, [%i0+0x10]%asi                     ! %l5 from previous read
2273         stxa    %l2, [%i0+0x18]%asi                     ! into %l5 and %l2
2274 
2275         ! Repeat the same for next 32 bytes.
2276 
2277         ldda    [%i1+0x30]%asi, %l4
2278         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2279         stxa    %l3, [%i0+0x20]%asi
2280         stxa    %l4, [%i0+0x28]%asi
2281 
2282         ldda    [%i1+0x40]%asi, %l2
2283         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2284         stxa    %l5, [%i0+0x30]%asi
2285         stxa    %l2, [%i0+0x38]%asi
2286 
2287         add     %l0, 0x40, %l0
2288         add     %i1, 0x40, %i1
2289         subcc   %i3, 0x40, %i3
2290         bgu,pt  %xcc, loop2
2291         add     %i0, 0x40, %i0
2292         ba      .blkdone
2293         add     %i1, %o2, %i1           ! increment the source by src offset
2294                                         ! the src offset was stored in %o2
2295 
2296 
2297         ! Both Source and Destination are block aligned.
2298         ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2299 .blkcpy:
2300         prefetch [%i1+0x0], #one_read
2301 1:
2302         ldda    [%i1+0x0]%asi, %l0
2303         ldda    [%i1+0x10]%asi, %l2
2304         prefetch [%i1+0x40], #one_read
2305 
2306         stxa    %l0, [%i0+0x0]%asi
2307         ldda    [%i1+0x20]%asi, %l4
2308         ldda    [%i1+0x30]%asi, %l6
2309 
2310         stxa    %l1, [%i0+0x8]%asi
2311         stxa    %l2, [%i0+0x10]%asi
2312         stxa    %l3, [%i0+0x18]%asi
2313         stxa    %l4, [%i0+0x20]%asi
2314         stxa    %l5, [%i0+0x28]%asi
2315         stxa    %l6, [%i0+0x30]%asi
2316         stxa    %l7, [%i0+0x38]%asi
2317 
2318         add     %i1, 0x40, %i1
2319         subcc   %i3, 0x40, %i3
2320         bgu,pt  %xcc, 1b
2321         add     %i0, 0x40, %i0
2322 
2323 .blkdone:
2324         membar  #Sync
2325 
2326         brz,pt  %i2, .blkexit
2327         nop
2328 
2329         ! Handle trailing bytes
2330         cmp     %i2, 0x8
2331         blu,pt  %ncc, .residue
2332         nop
2333 
2334         ! Can we do some 8B ops
2335         or      %i1, %i0, %o2
2336         andcc   %o2, 0x7, %g0
2337         bnz     %ncc, .last4
2338         nop
2339 
2340         ! Do 8byte ops as long as possible
2341 .last8:
2342         ldx     [%i1], %o2
2343         stx     %o2, [%i0]
2344         add     %i1, 0x8, %i1
2345         sub     %i2, 0x8, %i2
2346         cmp     %i2, 0x8
2347         bgu,pt  %ncc, .last8
2348         add     %i0, 0x8, %i0
2349 
2350         brz,pt  %i2, .blkexit
2351         nop
2352 
2353         ba      .residue
2354         nop
2355 
2356 .last4:
2357         ! Can we do 4B ops
2358         andcc   %o2, 0x3, %g0
2359         bnz     %ncc, .last2
2360         nop
2361 1:
2362         ld      [%i1], %o2
2363         st      %o2, [%i0]
2364         add     %i1, 0x4, %i1
2365         sub     %i2, 0x4, %i2
2366         cmp     %i2, 0x4
2367         bgu,pt  %ncc, 1b
2368         add     %i0, 0x4, %i0
2369 
2370         brz,pt  %i2, .blkexit
2371         nop
2372 
2373         ba      .residue
2374         nop
2375 
2376 .last2:
2377         ! Can we do 2B ops
2378         andcc   %o2, 0x1, %g0
2379         bnz     %ncc, .residue
2380         nop
2381 
2382 1:
2383         lduh    [%i1], %o2
2384         stuh    %o2, [%i0]
2385         add     %i1, 0x2, %i1
2386         sub     %i2, 0x2, %i2
2387         cmp     %i2, 0x2
2388         bgu,pt  %ncc, 1b
2389         add     %i0, 0x2, %i0
2390 
2391         brz,pt  %i2, .blkexit
2392         nop
2393 
2394 .residue:
2395         ldub    [%i1], %o2
2396         stb     %o2, [%i0]
2397         inc     %i1
2398         deccc   %i2
2399         bgu,pt  %ncc, .residue
2400         inc     %i0
2401 
2402 .blkexit:
2403 
2404         membar  #Sync                           ! sync error barrier
2405         ! Restore t_lofault handler, if we came here from kcopy().
2406         tst     %o5
2407         bz      %ncc, 1f
2408         andn    %o5, LOFAULT_SET, %o5
2409         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2410 1:
2411         ret
2412         restore %g0, 0, %o0
2413 
2414 
2415 .bcb_punt:
2416         !
2417         ! use aligned transfers where possible
2418         !
2419         xor     %i0, %i1, %o4           ! xor from and to address
2420         btst    7, %o4                  ! if lower three bits zero
2421         bz      .aldoubcp               ! can align on double boundary
2422         .empty  ! assembler complains about label
2423 
2424         xor     %i0, %i1, %o4           ! xor from and to address
2425         btst    3, %o4                  ! if lower two bits zero
2426         bz      .alwordcp               ! can align on word boundary
2427         btst    3, %i0                  ! delay slot, from address unaligned?
2428         !
2429         ! use aligned reads and writes where possible
2430         ! this differs from wordcp in that it copes
2431         ! with odd alignment between source and destination
2432         ! using word reads and writes with the proper shifts
2433         ! in between to align transfers to and from memory
2434         ! i0 - src address, i1 - dest address, i2 - count
2435         ! i3, i4 - tmps used for generating complete word
2436         ! i5 - word to write
2437         ! l0 size in bits of upper part of source word (US)
2438         ! l1 size in bits of lower part of source word (LS = 32 - US)
2439         ! l2 size in bits of upper part of destination word (UD)
2440         ! l3 size in bits of lower part of destination word (LD = 32 - UD)
2441         ! l4 number of bytes leftover after aligned transfers complete
2442         ! l5 the number 32
2443         !
2444         mov     32, %l5                 ! load an oft-needed constant
2445         bz      .align_dst_only
2446         btst    3, %i1                  ! is destination address aligned?
2447         clr     %i4                     ! clear registers used in either case
2448         bz      .align_src_only
2449         clr     %l0
2450         !
2451         ! both source and destination addresses are unaligned
2452         !
2453 1:                                      ! align source
2454         ldub    [%i0], %i3              ! read a byte from source address
2455         add     %i0, 1, %i0             ! increment source address
2456         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
2457         btst    3, %i0                  ! is source aligned?
2458         add     %l0, 8, %l0             ! increment size of upper source (US)
2459         bnz,a   1b
2460         sll     %i4, 8, %i4             ! make room for next byte
2461 
2462         sub     %l5, %l0, %l1           ! generate shift left count (LS)
2463         sll     %i4, %l1, %i4           ! prepare to get rest
2464         ld      [%i0], %i3              ! read a word
2465         add     %i0, 4, %i0             ! increment source address
2466         srl     %i3, %l0, %i5           ! upper src bits into lower dst bits
2467         or      %i4, %i5, %i5           ! merge
2468         mov     24, %l3                 ! align destination
2469 1:
2470         srl     %i5, %l3, %i4           ! prepare to write a single byte
2471         stb     %i4, [%i1]              ! write a byte
2472         add     %i1, 1, %i1             ! increment destination address
2473         sub     %i2, 1, %i2             ! decrement count
2474         btst    3, %i1                  ! is destination aligned?
2475         bnz,a   1b
2476         sub     %l3, 8, %l3             ! delay slot, decrement shift count (LD)
2477         sub     %l5, %l3, %l2           ! generate shift left count (UD)
2478         sll     %i5, %l2, %i5           ! move leftover into upper bytes
2479         cmp     %l2, %l0                ! cmp # reqd to fill dst w old src left
2480         bgu     %ncc, .more_needed      ! need more to fill than we have
2481         nop
2482 
2483         sll     %i3, %l1, %i3           ! clear upper used byte(s)
2484         srl     %i3, %l1, %i3
2485         ! get the odd bytes between alignments
2486         sub     %l0, %l2, %l0           ! regenerate shift count
2487         sub     %l5, %l0, %l1           ! generate new shift left count (LS)
2488         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
2489         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
2490         srl     %i3, %l0, %i4
2491         or      %i5, %i4, %i5
2492         st      %i5, [%i1]              ! write a word
2493         subcc   %i2, 4, %i2             ! decrement count
2494         bz      %ncc, .unalign_out
2495         add     %i1, 4, %i1             ! increment destination address
2496 
2497         b       2f
2498         sll     %i3, %l1, %i5           ! get leftover into upper bits
2499 .more_needed:
2500         sll     %i3, %l0, %i3           ! save remaining byte(s)
2501         srl     %i3, %l0, %i3
2502         sub     %l2, %l0, %l1           ! regenerate shift count
2503         sub     %l5, %l1, %l0           ! generate new shift left count
2504         sll     %i3, %l1, %i4           ! move to fill empty space
2505         b       3f
2506         or      %i5, %i4, %i5           ! merge to complete word
2507         !
2508         ! the source address is aligned and destination is not
2509         !
2510 .align_dst_only:
2511         ld      [%i0], %i4              ! read a word
2512         add     %i0, 4, %i0             ! increment source address
2513         mov     24, %l0                 ! initial shift alignment count
2514 1:
2515         srl     %i4, %l0, %i3           ! prepare to write a single byte
2516         stb     %i3, [%i1]              ! write a byte
2517         add     %i1, 1, %i1             ! increment destination address
2518         sub     %i2, 1, %i2             ! decrement count
2519         btst    3, %i1                  ! is destination aligned?
2520         bnz,a   1b
2521         sub     %l0, 8, %l0             ! delay slot, decrement shift count
2522 .xfer:
2523         sub     %l5, %l0, %l1           ! generate shift left count
2524         sll     %i4, %l1, %i5           ! get leftover
2525 3:
2526         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
2527         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
2528 2:
2529         ld      [%i0], %i3              ! read a source word
2530         add     %i0, 4, %i0             ! increment source address
2531         srl     %i3, %l0, %i4           ! upper src bits into lower dst bits
2532         or      %i5, %i4, %i5           ! merge with upper dest bits (leftover)
2533         st      %i5, [%i1]              ! write a destination word
2534         subcc   %i2, 4, %i2             ! decrement count
2535         bz      %ncc, .unalign_out      ! check if done
2536         add     %i1, 4, %i1             ! increment destination address
2537         b       2b                      ! loop
2538         sll     %i3, %l1, %i5           ! get leftover
2539 .unalign_out:
2540         tst     %l4                     ! any bytes leftover?
2541         bz      %ncc, .cpdone
2542         .empty                          ! allow next instruction in delay slot
2543 1:
2544         sub     %l0, 8, %l0             ! decrement shift
2545         srl     %i3, %l0, %i4           ! upper src byte into lower dst byte
2546         stb     %i4, [%i1]              ! write a byte
2547         subcc   %l4, 1, %l4             ! decrement count
2548         bz      %ncc, .cpdone           ! done?
2549         add     %i1, 1, %i1             ! increment destination
2550         tst     %l0                     ! any more previously read bytes
2551         bnz     %ncc, 1b                ! we have leftover bytes
2552         mov     %l4, %i2                ! delay slot, mv cnt where dbytecp wants
2553         b       .dbytecp                ! let dbytecp do the rest
2554         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
2555         !
2556         ! the destination address is aligned and the source is not
2557         !
2558 .align_src_only:
2559         ldub    [%i0], %i3              ! read a byte from source address
2560         add     %i0, 1, %i0             ! increment source address
2561         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
2562         btst    3, %i0                  ! is source aligned?
2563         add     %l0, 8, %l0             ! increment shift count (US)
2564         bnz,a   .align_src_only
2565         sll     %i4, 8, %i4             ! make room for next byte
2566         b,a     .xfer
2567         !
2568         ! if from address unaligned for double-word moves,
2569         ! move bytes till it is, if count is < 56 it could take
2570         ! longer to align the thing than to do the transfer
2571         ! in word size chunks right away
2572         !
2573 .aldoubcp:
2574         cmp     %i2, 56                 ! if count < 56, use wordcp, it takes
2575         blu,a   %ncc, .alwordcp         ! longer to align doubles than words
2576         mov     3, %o0                  ! mask for word alignment
2577         call    .alignit                ! copy bytes until aligned
2578         mov     7, %o0                  ! mask for double alignment
2579         !
2580         ! source and destination are now double-word aligned
2581         ! i3 has aligned count returned by alignit
2582         !
2583         and     %i2, 7, %i2             ! unaligned leftover count
2584         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
2585 5:
2586         ldx     [%i0+%i1], %o4          ! read from address
2587         stx     %o4, [%i1]              ! write at destination address
2588         subcc   %i3, 8, %i3             ! dec count
2589         bgu     %ncc, 5b
2590         add     %i1, 8, %i1             ! delay slot, inc to address
2591         cmp     %i2, 4                  ! see if we can copy a word
2592         blu     %ncc, .dbytecp          ! if 3 or less bytes use bytecp
2593         .empty
2594         !
2595         ! for leftover bytes we fall into wordcp, if needed
2596         !
2597 .wordcp:
2598         and     %i2, 3, %i2             ! unaligned leftover count
2599 5:
2600         ld      [%i0+%i1], %o4          ! read from address
2601         st      %o4, [%i1]              ! write at destination address
2602         subcc   %i3, 4, %i3             ! dec count
2603         bgu     %ncc, 5b
2604         add     %i1, 4, %i1             ! delay slot, inc to address
2605         b,a     .dbytecp
2606 
2607         ! we come here to align copies on word boundaries
2608 .alwordcp:
2609         call    .alignit                ! go word-align it
2610         mov     3, %o0                  ! bits that must be zero to be aligned
2611         b       .wordcp
2612         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
2613 
2614         !
2615         ! byte copy, works with any alignment
2616         !
2617 .bytecp:
2618         b       .dbytecp
2619         sub     %i0, %i1, %i0           ! i0 gets difference of src and dst
2620 
2621         !
2622         ! differenced byte copy, works with any alignment
2623         ! assumes dest in %i1 and (source - dest) in %i0
2624         !
2625 1:
2626         stb     %o4, [%i1]              ! write to address
2627         inc     %i1                     ! inc to address
2628 .dbytecp:
2629         deccc   %i2                     ! dec count
2630         bgeu,a  %ncc, 1b                ! loop till done
2631         ldub    [%i0+%i1], %o4          ! read from address
2632 .cpdone:
2633 
2634         membar  #Sync                           ! sync error barrier
2635         ! Restore t_lofault handler, if we came here from kcopy().
2636         tst     %o5
2637         bz      %ncc, 1f
2638         andn    %o5, LOFAULT_SET, %o5
2639         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2640 1:
2641         ret
2642         restore %g0, 0, %o0             ! return (0)
2643 
2644 /*
2645  * Common code used to align transfers on word and doubleword
2646  * boundaries.  Aligns source and destination and returns a count
2647  * of aligned bytes to transfer in %i3
2648  */
2649 1:
2650         inc     %i0                     ! inc from
2651         stb     %o4, [%i1]              ! write a byte
2652         inc     %i1                     ! inc to
2653         dec     %i2                     ! dec count
2654 .alignit:
2655         btst    %o0, %i0                ! %o0 is bit mask to check for alignment
2656         bnz,a   1b
2657         ldub    [%i0], %o4              ! read next byte
2658 
2659         retl
2660         andn    %i2, %o0, %i3           ! return size of aligned bytes
2661         
2662         SET_SIZE(bcopy)
2663 
2664 #endif  /* NIAGARA_IMPL */
2665 
2666 /*
2667  * Block copy with possibly overlapped operands.
2668  */
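/*
 * Illustrative C-level sketch of the dispatch below (not part of the
 * build); from, to and count arrive in %o0, %o1 and %o2:
 *
 *	if (count == 0)
 *		return;
 *	if (count <= abs(from - to))
 *		bcopy(from, to, count);	! regions cannot overlap
 *	else if (from < to)
 *		copy backwards, last byte first;
 *	else
 *		copy forwards;
 */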
2669 
2670         ENTRY(ovbcopy)
2671         tst     %o2                     ! check count
2672         bgu,a   %ncc, 1f                ! branch if count > 0; else return
2673         subcc   %o0, %o1, %o3           ! difference of from and to address
2674 
2675         retl                            ! return
2676         nop
2677 1:
2678         bneg,a  %ncc, 2f
2679         neg     %o3                     ! if < 0, make it positive
2680 2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
2681         bleu    %ncc, bcopy             ! if size <= abs(diff): use bcopy,
2682         .empty                          !   no overlap
2683         cmp     %o0, %o1                ! compare from and to addresses
2684         blu     %ncc, .ov_bkwd          ! if from < to, copy backwards
2685         nop
2686         !
2687         ! Copy forwards.
2688         !
2689 .ov_fwd:
2690         ldub    [%o0], %o3              ! read from address
2691         inc     %o0                     ! inc from address
2692         stb     %o3, [%o1]              ! write to address
2693         deccc   %o2                     ! dec count
2694         bgu     %ncc, .ov_fwd           ! loop till done
2695         inc     %o1                     ! inc to address
2696 
2697         retl                            ! return
2698         nop
2699         !
2700         ! Copy backwards.
2701         !
2702 .ov_bkwd:
2703         deccc   %o2                     ! dec count
2704         ldub    [%o0 + %o2], %o3        ! get byte at end of src
2705         bgu     %ncc, .ov_bkwd          ! loop till done
2706         stb     %o3, [%o1 + %o2]        ! delay slot, store at end of dst
2707 
2708         retl                            ! return
2709         nop
2710         SET_SIZE(ovbcopy)
2711 
2712 /*
2713  * hwblkpagecopy()
2714  *
2715  * Copies exactly one page.  This routine assumes the caller (ppcopy)
2716  * has already disabled kernel preemption and has checked
2717  * use_hw_bcopy.
2718  */
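/*
 * Illustrative sketch of the loop below (not part of the build):
 *
 *	for (off = 0; off < PAGESIZE; off += 0x80) {
 *		prefetch(src + off + 0x80); prefetch(src + off + 0xc0);
 *		quad-load 0x80 bytes from src + off;
 *		block-initializing-store them at dst + off;
 *	}
 *	membar #Sync;
 */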
2719         ENTRY(hwblkpagecopy)
2720         save    %sp, -SA(MINFRAME), %sp
2721 
2722         ! %i0 - source address (arg)
2723         ! %i1 - destination address (arg)
2724         ! %i2 - length of region (not arg)
2725 
2726         set     PAGESIZE, %i2
2727 
2728         /*
2729          * We copy exactly one page, and PAGESIZE is a multiple of 0x80.
2730          */
2731         mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2732         prefetch [%i0+0x0], #one_read
2733         prefetch [%i0+0x40], #one_read
2734 1:
2735         prefetch [%i0+0x80], #one_read
2736         prefetch [%i0+0xc0], #one_read
2737         ldda    [%i0+0x0]%asi, %l0
2738         ldda    [%i0+0x10]%asi, %l2
2739         ldda    [%i0+0x20]%asi, %l4
2740         ldda    [%i0+0x30]%asi, %l6
2741         stxa    %l0, [%i1+0x0]%asi
2742         stxa    %l1, [%i1+0x8]%asi
2743         stxa    %l2, [%i1+0x10]%asi
2744         stxa    %l3, [%i1+0x18]%asi
2745         stxa    %l4, [%i1+0x20]%asi
2746         stxa    %l5, [%i1+0x28]%asi
2747         stxa    %l6, [%i1+0x30]%asi
2748         stxa    %l7, [%i1+0x38]%asi
2749         ldda    [%i0+0x40]%asi, %l0
2750         ldda    [%i0+0x50]%asi, %l2
2751         ldda    [%i0+0x60]%asi, %l4
2752         ldda    [%i0+0x70]%asi, %l6
2753         stxa    %l0, [%i1+0x40]%asi
2754         stxa    %l1, [%i1+0x48]%asi
2755         stxa    %l2, [%i1+0x50]%asi
2756         stxa    %l3, [%i1+0x58]%asi
2757         stxa    %l4, [%i1+0x60]%asi
2758         stxa    %l5, [%i1+0x68]%asi
2759         stxa    %l6, [%i1+0x70]%asi
2760         stxa    %l7, [%i1+0x78]%asi
2761 
2762         add     %i0, 0x80, %i0
2763         subcc   %i2, 0x80, %i2
2764         bgu,pt  %xcc, 1b
2765         add     %i1, 0x80, %i1
2766 
2767         membar #Sync
2768         ret
2769         restore %g0, 0, %o0
2770         SET_SIZE(hwblkpagecopy)
2771 
2772 
2773 /*
2774  * Transfer data to and from user space -
2775  * Note that these routines can cause faults.
2776  * It is assumed that the kernel has nothing at
2777  * less than KERNELBASE in the virtual address space.
2778  *
2779  * Note that copyin(9F) and copyout(9F) are part of the
2780  * DDI/DKI which specifies that they return '-1' on "errors."
2781  *
2782  * Sigh.
2783  *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
2785  * which return the errno that we've faithfully computed.  This
2786  * allows other callers (e.g. uiomove(9F)) to work correctly.
2787  * Given that these are used pretty heavily, we expand the calling
2788  * sequences inline for all flavours (rather than making wrappers).
2789  *
2790  * There are also stub routines for xcopyout_little and xcopyin_little,
2791  * which currently are intended to handle requests of <= 16 bytes from
2792  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2793  * is left as an exercise...
2794  */
2795 
2796 /*
2797  * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2798  *
2799  * General theory of operation:
2800  *
2801  * None of the copyops routines grab a window until it's decided that
2802  * we need to do a HW block copy operation. This saves a window
2803  * spill/fill when we're called during socket ops. The typical IO
2804  * path won't cause spill/fill traps.
2805  *
2806  * This code uses a set of 4 limits for the maximum size that will
2807  * be copied given a particular input/output address alignment.
 * The default limits are:
2809  *
2810  * single byte aligned - 256 (hw_copy_limit_1)
2811  * two byte aligned - 512 (hw_copy_limit_2)
2812  * four byte aligned - 1024 (hw_copy_limit_4)
2813  * eight byte aligned - 1024 (hw_copy_limit_8)
2814  *
2815  * If the value for a particular limit is zero, the copy will be done
2816  * via the copy loops rather than block store/quad load instructions.
2817  *
2818  * Flow:
2819  *
 * If count == zero, return zero.
 *
 * Store the previous lofault handler into %g6.
 * Place our secondary lofault handler into %g5.
 * Place the address of our nowindow fault handler into %o3.
 * Place the address of the windowed fault handler into %o4.
 * --> We'll use this handler if we end up grabbing a window
 * --> before we use block initializing store and quad load ASIs
 *
 * If count is less than or equal to SMALL_LIMIT (7) we
 * always do a byte-for-byte copy.
 *
 * If count is > SMALL_LIMIT, we check the alignment of the input
 * and output pointers.  Based on the detected alignment we check
 * count against the limit for that alignment.  If count exceeds
 * the limit, we copy via block initializing store and quad
 * load instructions.
2837  *
 * If we don't exceed one of the limits, we store -count in %o3,
 * and we store the number of chunks (8, 4, 2 or 1 byte) operated
 * on in our basic copy loop in %o2.  Following this we branch
 * to the appropriate copy loop and copy that many chunks.
 * Since we've been adding the chunk size to %o3 each time through
 * as well as decrementing %o2, we can tell if any data is
 * left to be copied by examining %o3.  If that is zero, we're
 * done and can go home.  If not, we figure out what the largest
 * chunk size left to be copied is and branch to that copy loop
 * unless there's only one byte left.  We load that as we're
 * branching to code that stores it just before we return.
2849  *
2850  * Fault handlers are invoked if we reference memory that has no
2851  * current mapping.  All forms share the same copyio_fault handler.
2852  * This routine handles fixing up the stack and general housecleaning.
2853  * Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation.  The handlers
 * for copyOP and xcopyOP are found at the end of each individual function.
2856  * The handlers for xcopyOP_little are found at the end of xcopyin_little.
2857  * The handlers for copyOP_noerr are found at the end of copyin_noerr.
2858  */
2859 
2860 /*
2861  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2862  */
2863 
2864 /*
2865  * We save the arguments in the following registers in case of a fault:
2866  *      kaddr - %g2
2867  *      uaddr - %g3
2868  *      count - %g4
2869  */
2870 #define SAVE_SRC        %g2
2871 #define SAVE_DST        %g3
2872 #define SAVE_COUNT      %g4
2873 
2874 #define REAL_LOFAULT            %g5
2875 #define SAVED_LOFAULT           %g6
2876 
2877 /*
2878  * Generic copyio fault handler.  This is the first line of defense when a 
2879  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
2880  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2881  * This allows us to share common code for all the flavors of the copy
2882  * operations, including the _noerr versions.
2883  *
2884  * Note that this function will restore the original input parameters before
2885  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
2886  * member of the t_copyop structure, if needed.
2887  */
2888         ENTRY(copyio_fault)
2889 #if !defined(NIAGARA_IMPL)
2890         btst    FPUSED_FLAG, SAVED_LOFAULT
2891         bz      1f
2892         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2893 
2894         wr      %l5, 0, %gsr            ! restore gsr
2895 
2896         btst    FPRS_FEF, %g1
2897         bz      %icc, 4f
2898         nop
2899 
2900         ! restore fpregs from stack
2901         BLD_FP_FROMSTACK(%o2)
2902 
2903         ba,pt   %ncc, 1f
2904         nop
2905 4:
2906         FZERO                           ! zero all of the fpregs
2907         wr      %g1, %g0, %fprs         ! restore fprs
2908 1:
2909         restore
2910         mov     SAVE_SRC, %o0
2911         mov     SAVE_DST, %o1
2912         jmp     REAL_LOFAULT
2913         mov     SAVE_COUNT, %o2
2914 
2915 #else   /* NIAGARA_IMPL */
2916         membar  #Sync
2917         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2918         restore
2919         mov     SAVE_SRC, %o0
2920         mov     SAVE_DST, %o1
2921         jmp     REAL_LOFAULT
2922         mov     SAVE_COUNT, %o2
2923 
2924 #endif  /* NIAGARA_IMPL */
2925 
2926         SET_SIZE(copyio_fault)
2927 
2928         ENTRY(copyio_fault_nowindow)
2929         membar  #Sync
2930         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2931 
2932         mov     SAVE_SRC, %o0
2933         mov     SAVE_DST, %o1
2934         jmp     REAL_LOFAULT
2935         mov     SAVE_COUNT, %o2
2936         SET_SIZE(copyio_fault_nowindow)
2937 
2938         ENTRY(copyout)
2939         sethi   %hi(.copyout_err), REAL_LOFAULT
2940         or      REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2941 
2942 #if !defined(NIAGARA_IMPL)
2943 .do_copyout:
2944         tst     %o2                     ! check for zero count;  quick exit
2945         bz,pt   %ncc, .co_smallqx
2946         mov     %o0, SAVE_SRC
2947         mov     %o1, SAVE_DST
2948         mov     %o2, SAVE_COUNT
2949         cmp     %o2, FP_COPY            ! check for small copy/leaf case
2950         bgt,pt  %ncc, .co_copy_more
2951         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2952 /*
2953  * Small copy out code
2954  * 
2955  */
2956         sethi   %hi(copyio_fault_nowindow), %o3
2957         or      %o3, %lo(copyio_fault_nowindow), %o3
2958         membar  #Sync
2959         stn     %o3, [THREAD_REG + T_LOFAULT]
2960 
2961         mov     ASI_USER, %asi
2962         cmp     %o2, SHORTCOPY          ! make sure there is enough to align
2963         ble,pt  %ncc, .co_smallest
2964         andcc   %o1, 0x7, %o3           ! is dest long word aligned
2965         bnz,pn  %ncc, .co_align
2966         andcc   %o1, 1, %o3             ! is dest byte aligned
2967 
2968 ! Destination is long word aligned
2969 ! 8 cases for src alignment; load parts, store long words
2970 .co_al_src:
2971         andcc   %o0, 7, %o3
2972         brnz,pt %o3, .co_src_dst_unal8
2973         nop
2974 /*
2975  * Special case for handling when src and dest are both long word aligned
2976  * and total data to move is less than FP_COPY bytes
 * Also handles the finish-up for large block moves, so the count
 * may be less than 32 bytes
2978  */
2979 .co_medlong:
2980         subcc   %o2, 31, %o2            ! adjust length to allow cc test
2981         ble,pt  %ncc, .co_medl31
2982         nop
2983 .co_medl32:
2984         ldx     [%o0], %o4              ! move 32 bytes
2985         subcc   %o2, 32, %o2            ! decrement length count by 32
2986         stxa    %o4, [%o1]%asi
2987         ldx     [%o0+8], %o4
2988         stxa    %o4, [%o1+8]%asi
2989         ldx     [%o0+16], %o4
2990         add     %o0, 32, %o0            ! increase src ptr by 32
2991         stxa    %o4, [%o1+16]%asi
2992         ldx     [%o0-8], %o4
2993         add     %o1, 32, %o1            ! increase dst ptr by 32
2994         bgu,pt  %ncc, .co_medl32        ! repeat if at least 32 bytes left
2995         stxa    %o4, [%o1-8]%asi
2996 .co_medl31:
2997         addcc   %o2, 24, %o2            ! adjust count to be off by 7
2998         ble,pt  %ncc, .co_medl7         ! skip if 7 or fewer bytes left
2999         nop
3000 .co_medl8:
3001         ldx     [%o0], %o4              ! move 8 bytes
3002         add     %o0, 8, %o0             ! increase src ptr by 8
3003         subcc   %o2, 8, %o2             ! decrease count by 8
3004         add     %o1, 8, %o1             ! increase dst ptr by 8
3005         bgu,pt  %ncc, .co_medl8
3006         stxa    %o4, [%o1-8]%asi
3007 .co_medl7:
3008         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
3009         bnz,pt  %ncc, .co_small4        ! do final bytes if not finished
3010 
3011 .co_smallx:                             ! finish up and exit
3012         membar  #Sync
3013         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3014 .co_smallqx:
3015         retl
3016         mov     %g0, %o0
3017 
3018 .co_small4:
3019         cmp     %o2, 4
3020         blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
3021         nop                             !
3022         ld      [%o0], %o4              ! move 4 bytes
3023         add     %o0, 4, %o0             ! increase src ptr by 4
3024         add     %o1, 4, %o1             ! increase dst ptr by 4
3025         subcc   %o2, 4, %o2             ! decrease count by 4
3026         bz,pt   %ncc, .co_smallx
3027         stwa    %o4, [%o1-4]%asi
3028 
3029 .co_small3x:                            ! Exactly 1, 2, or 3 bytes remain
3030         subcc   %o2, 1, %o2             ! reduce count for cc test
3031         ldub    [%o0], %o4              ! load one byte
3032         bz,pt   %ncc, .co_smallx
3033         stba    %o4, [%o1]%asi          ! store one byte
3034         ldub    [%o0+1], %o4            ! load second byte
3035         subcc   %o2, 1, %o2
3036         bz,pt   %ncc, .co_smallx
3037         stba    %o4, [%o1+1]%asi        ! store second byte
3038         ldub    [%o0+2], %o4            ! load third byte
3039         ba      .co_smallx
3040         stba    %o4, [%o1+2]%asi        ! store third byte
3041 
3042 .co_smallest:                           ! 7 or fewer bytes remain
3043         cmp     %o2, 4
3044         blt,pt  %ncc, .co_small3x
3045         nop
3046         ldub    [%o0], %o4              ! read byte
3047         subcc   %o2, 4, %o2             ! reduce count by 4
3048         stba    %o4, [%o1]%asi          ! write byte
3049         ldub    [%o0+1], %o4            ! repeat for total of 4 bytes
3050         add     %o0, 4, %o0             ! advance src by 4
3051         stba    %o4, [%o1+1]%asi
3052         ldub    [%o0-2], %o4
3053         add     %o1, 4, %o1             ! advance dst by 4
3054         stba    %o4, [%o1-2]%asi
3055         ldub    [%o0-1], %o4
3056         bnz,pt  %ncc, .co_small3x
3057         stba    %o4, [%o1-1]%asi
3058         membar  #Sync
3059         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3060         retl
3061         mov     %g0, %o0
3062 
3063 .co_align:                              ! byte align test in prior branch delay
3064         bnz,pt  %ncc, .co_al_d1
3065 .co_al_d1f:                             ! dest is now half word aligned
3066         andcc   %o1, 2, %o3
3067         bnz,pt  %ncc, .co_al_d2
3068 .co_al_d2f:                             ! dest is now word aligned
3069         andcc   %o1, 4, %o3             ! is dest longword aligned?
3070         bz,pt   %ncc, .co_al_src
3071         nop
3072 .co_al_d4:                              ! dest is word aligned;  src is unknown
3073         ldub    [%o0], %o4              ! move a word (src align unknown)
3074         ldub    [%o0+1], %o3
3075         sll     %o4, 24, %o4            ! position
3076         sll     %o3, 16, %o3            ! position
3077         or      %o4, %o3, %o3           ! merge
3078         ldub    [%o0+2], %o4
3079         sll     %o4, 8, %o4             ! position
3080         or      %o4, %o3, %o3           ! merge
3081         ldub    [%o0+3], %o4
3082         or      %o4, %o3, %o4           ! merge
3083         stwa    %o4,[%o1]%asi           ! store four bytes
3084         add     %o0, 4, %o0             ! adjust src by 4
3085         add     %o1, 4, %o1             ! adjust dest by 4
3086         sub     %o2, 4, %o2             ! adjust count by 4
3087         andcc   %o0, 7, %o3             ! check for src long word alignment
3088         brz,pt  %o3, .co_medlong
3089 .co_src_dst_unal8:
3090         ! dst is 8-byte aligned, src is not
3091         ! Size is less than FP_COPY
        ! The following code selects a copy loop based on src alignment
3093         andcc   %o0, 0x3, %o3           ! test word alignment
3094         bz,pt   %ncc, .co_medword
3095         nop
3096         andcc   %o0, 0x1, %o3           ! test halfword alignment
3097         bnz,pt  %ncc, .co_med_byte      ! go to byte move if not halfword
3098         andcc   %o0, 0x2, %o3           ! test which byte alignment
3099         ba      .co_medhalf
3100         nop
3101 .co_al_d1:                              ! align dest to half word
3102         ldub    [%o0], %o4              ! move a byte
3103         add     %o0, 1, %o0
3104         stba    %o4, [%o1]%asi
3105         add     %o1, 1, %o1
3106         andcc   %o1, 2, %o3
3107         bz,pt   %ncc, .co_al_d2f
3108         sub     %o2, 1, %o2
3109 .co_al_d2:                              ! align dest to word
3110         ldub    [%o0], %o4              ! move a half-word (src align unknown)
3111         ldub    [%o0+1], %o3
3112         sll     %o4, 8, %o4             ! position
3113         or      %o4, %o3, %o4           ! merge
3114         stha    %o4, [%o1]%asi
3115         add     %o0, 2, %o0
3116         add     %o1, 2, %o1
3117         andcc   %o1, 4, %o3             ! is dest longword aligned?
3118         bz,pt   %ncc, .co_al_src
3119         sub     %o2, 2, %o2
3120         ba      .co_al_d4
3121         nop
3122 /*
3123  * Handle all cases where src and dest are aligned on word
3124  * boundaries. Use unrolled loops for better performance.
3125  * This option wins over standard large data move when 
3126  * source and destination is in cache for medium
3127  * to short data moves.
3128  */
3129 .co_medword:
3130         subcc   %o2, 31, %o2            ! adjust length to allow cc test
3131         ble,pt  %ncc, .co_medw31
3132         nop
3133 .co_medw32:
3134         ld      [%o0], %o4              ! move a block of 32 bytes
3135         stwa    %o4, [%o1]%asi
3136         ld      [%o0+4], %o4
3137         stwa    %o4, [%o1+4]%asi
3138         ld      [%o0+8], %o4
3139         stwa    %o4, [%o1+8]%asi
3140         ld      [%o0+12], %o4
3141         stwa    %o4, [%o1+12]%asi
3142         ld      [%o0+16], %o4
3143         stwa    %o4, [%o1+16]%asi
3144         ld      [%o0+20], %o4
3145         subcc   %o2, 32, %o2            ! decrement length count
3146         stwa    %o4, [%o1+20]%asi
3147         ld      [%o0+24], %o4
3148         add     %o0, 32, %o0            ! increase src ptr by 32
3149         stwa    %o4, [%o1+24]%asi
3150         ld      [%o0-4], %o4
3151         add     %o1, 32, %o1            ! increase dst ptr by 32
3152         bgu,pt  %ncc, .co_medw32        ! repeat if at least 32 bytes left
3153         stwa    %o4, [%o1-4]%asi
3154 .co_medw31:
3155         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3156         ble,pt  %ncc, .co_medw7         ! skip if 7 or fewer bytes left
3157         nop                             !
3158 .co_medw15:
3159         ld      [%o0], %o4              ! move a block of 8 bytes
3160         subcc   %o2, 8, %o2             ! decrement length count
3161         stwa    %o4, [%o1]%asi
3162         add     %o0, 8, %o0             ! increase src ptr by 8
3163         ld      [%o0-4], %o4
3164         add     %o1, 8, %o1             ! increase dst ptr by 8
3165         bgu,pt  %ncc, .co_medw15
3166         stwa    %o4, [%o1-4]%asi
3167 .co_medw7:
3168         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
3169         bz,pt   %ncc, .co_smallx        ! exit if finished
3170         cmp     %o2, 4
3171         blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
3172         nop                             !
3173         ld      [%o0], %o4              ! move 4 bytes
3174         add     %o0, 4, %o0             ! increase src ptr by 4
3175         add     %o1, 4, %o1             ! increase dst ptr by 4
3176         subcc   %o2, 4, %o2             ! decrease count by 4
3177         bnz     .co_small3x
3178         stwa    %o4, [%o1-4]%asi
3179         membar  #Sync
3180         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3181         retl
3182         mov     %g0, %o0
3183 
3184 .co_medhalf:
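        ! Sketch of one 8-byte unit below (illustrative only): halfword
        ! loads are merged so each store is a single aligned 8-byte store
        !
        !       x  = (uint64) load2(src)     << 48;
        !       x |= (uint64) load4(src + 2) << 16;
        !       x |= (uint64) load2(src + 6);
        !       store8(dst, x);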
3185         subcc   %o2, 31, %o2            ! adjust length to allow cc test
3186         ble,pt  %ncc, .co_medh31
3187         nop
3188 .co_medh32:                             ! load and store block of 32 bytes
3189 
3190         lduh    [%o0], %o4              ! move 32 bytes
3191         subcc   %o2, 32, %o2            ! decrement length count
3192         lduw    [%o0+2], %o3
3193         sllx    %o4, 48, %o4
3194         sllx    %o3, 16, %o3
3195         or      %o4, %o3, %o3
3196         lduh    [%o0+6], %o4
3197         or      %o4, %o3, %o4
3198         stxa    %o4, [%o1]%asi
3199 
3200         lduh    [%o0+8], %o4
3201         lduw    [%o0+10], %o3
3202         sllx    %o4, 48, %o4
3203         sllx    %o3, 16, %o3
3204         or      %o4, %o3, %o3
3205         lduh    [%o0+14], %o4
3206         or      %o4, %o3, %o4
3207         stxa    %o4, [%o1+8]%asi
3208 
3209         lduh    [%o0+16], %o4
3210         lduw    [%o0+18], %o3
3211         sllx    %o4, 48, %o4
3212         sllx    %o3, 16, %o3
3213         or      %o4, %o3, %o3
3214         lduh    [%o0+22], %o4
3215         or      %o4, %o3, %o4
3216         stxa    %o4, [%o1+16]%asi
3217 
3218         add     %o0, 32, %o0            ! increase src ptr by 32
3219         add     %o1, 32, %o1            ! increase dst ptr by 32
3220 
3221         lduh    [%o0-8], %o4
3222         lduw    [%o0-6], %o3
3223         sllx    %o4, 48, %o4
3224         sllx    %o3, 16, %o3
3225         or      %o4, %o3, %o3
3226         lduh    [%o0-2], %o4
3227         or      %o3, %o4, %o4
3228         bgu,pt  %ncc, .co_medh32        ! repeat if at least 32 bytes left
3229         stxa    %o4, [%o1-8]%asi
3230 
3231 .co_medh31:
3232         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3233         ble,pt  %ncc, .co_medh7         ! skip if 7 or fewer bytes left
3234         nop                             !
3235 .co_medh15:
        lduh    [%o0], %o4              ! move 8 bytes
3237         subcc   %o2, 8, %o2             ! decrement length count
3238         lduw    [%o0+2], %o3
3239         sllx    %o4, 48, %o4
3240         sllx    %o3, 16, %o3
3241         or      %o4, %o3, %o3
3242         add     %o1, 8, %o1             ! increase dst ptr by 8
3243         lduh    [%o0+6], %o4
3244         add     %o0, 8, %o0             ! increase src ptr by 8
3245         or      %o4, %o3, %o4
3246         bgu,pt  %ncc, .co_medh15
3247         stxa    %o4, [%o1-8]%asi
3248 .co_medh7:
3249         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
3250         bz,pt   %ncc, .co_smallx        ! exit if finished
3251         cmp     %o2, 4
3252         blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
3253         nop                             !
3254         lduh    [%o0], %o4
3255         sll     %o4, 16, %o4
3256         lduh    [%o0+2], %o3
3257         or      %o3, %o4, %o4
3258         subcc   %o2, 4, %o2
3259         add     %o0, 4, %o0
3260         add     %o1, 4, %o1
3261         bnz     .co_small3x
3262         stwa    %o4, [%o1-4]%asi
3263         membar  #Sync
3264         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3265         retl
3266         mov     %g0, %o0
3267 
3268         .align 16
3269 .co_med_byte:
3270         bnz,pt  %ncc, .co_medbh32a      ! go to correct byte move
3271         subcc   %o2, 31, %o2            ! adjust length to allow cc test
3272         ble,pt  %ncc, .co_medb31
3273         nop
3274 .co_medb32:                             ! Alignment 1 or 5
3275         subcc   %o2, 32, %o2            ! decrement length count
3276 
3277         ldub    [%o0], %o4              ! load and store a block of 32 bytes
3278         sllx    %o4, 56, %o3
3279         lduh    [%o0+1], %o4
3280         sllx    %o4, 40, %o4
3281         or      %o4, %o3, %o3
3282         lduw    [%o0+3], %o4
3283         sllx    %o4, 8, %o4
3284         or      %o4, %o3, %o3
3285         ldub    [%o0+7], %o4
3286         or      %o4, %o3, %o4
3287         stxa    %o4, [%o1]%asi
3288 
3289         ldub    [%o0+8], %o4
3290         sllx    %o4, 56, %o3
3291         lduh    [%o0+9], %o4
3292         sllx    %o4, 40, %o4
3293         or      %o4, %o3, %o3
3294         lduw    [%o0+11], %o4
3295         sllx    %o4, 8, %o4
3296         or      %o4, %o3, %o3
3297         ldub    [%o0+15], %o4
3298         or      %o4, %o3, %o4
3299         stxa    %o4, [%o1+8]%asi
3300 
3301         ldub    [%o0+16], %o4
3302         sllx    %o4, 56, %o3
3303         lduh    [%o0+17], %o4
3304         sllx    %o4, 40, %o4
3305         or      %o4, %o3, %o3
3306         lduw    [%o0+19], %o4
3307         sllx    %o4, 8, %o4
3308         or      %o4, %o3, %o3
3309         ldub    [%o0+23], %o4
3310         or      %o4, %o3, %o4
3311         stxa    %o4, [%o1+16]%asi
3312 
3313         add     %o0, 32, %o0            ! increase src ptr by 32
3314         add     %o1, 32, %o1            ! increase dst ptr by 32
3315 
3316         ldub    [%o0-8], %o4
3317         sllx    %o4, 56, %o3
3318         lduh    [%o0-7], %o4
3319         sllx    %o4, 40, %o4
3320         or      %o4, %o3, %o3
3321         lduw    [%o0-5], %o4
3322         sllx    %o4, 8, %o4
3323         or      %o4, %o3, %o3
3324         ldub    [%o0-1], %o4
3325         or      %o4, %o3, %o4
3326         bgu,pt  %ncc, .co_medb32        ! repeat if at least 32 bytes left
3327         stxa    %o4, [%o1-8]%asi
3328 
3329 .co_medb31:                             ! 31 or fewer bytes remaining
3330         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3331         ble,pt  %ncc, .co_medb7         ! skip if 7 or fewer bytes left
3332         nop                             !
3333 .co_medb15:
3334 
3335         ldub    [%o0], %o4              ! load and store a block of 8 bytes
3336         subcc   %o2, 8, %o2             ! decrement length count
3337         sllx    %o4, 56, %o3
3338         lduh    [%o0+1], %o4
3339         sllx    %o4, 40, %o4
3340         or      %o4, %o3, %o3
3341         lduw    [%o0+3], %o4
        add     %o1, 8, %o1             ! increase dst ptr by 8
3343         sllx    %o4, 8, %o4
3344         or      %o4, %o3, %o3
3345         ldub    [%o0+7], %o4
        add     %o0, 8, %o0             ! increase src ptr by 8
3347         or      %o4, %o3, %o4
3348         bgu,pt  %ncc, .co_medb15
3349         stxa    %o4, [%o1-8]%asi
3350 .co_medb7:
3351         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
3352         bz,pt   %ncc, .co_smallx        ! exit if finished
3353         cmp     %o2, 4
3354         blt,pt  %ncc, .co_small3x       ! skip if less than 4 bytes left
3355         nop                             !
3356         ldub    [%o0], %o4              ! move 4 bytes
3357         sll     %o4, 24, %o3
3358         lduh    [%o0+1], %o4
3359         sll     %o4, 8, %o4
3360         or      %o4, %o3, %o3
3361         ldub    [%o0+3], %o4
3362         or      %o4, %o3, %o4
3363         subcc   %o2, 4, %o2
3364         add     %o0, 4, %o0
3365         add     %o1, 4, %o1
3366         bnz     .co_small3x
3367         stwa    %o4, [%o1-4]%asi
3368         membar  #Sync
3369         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3370         retl
3371         mov     %g0, %o0
3372 
3373         .align 16
3374 .co_medbh32a:
3375         ble,pt  %ncc, .co_medbh31
3376         nop
3377 .co_medbh32:                            ! Alignment 3 or 7
3378         subcc   %o2, 32, %o2            ! decrement length count
3379 
3380         ldub    [%o0], %o4              ! load and store a block of 32 bytes
3381         sllx    %o4, 56, %o3
3382         lduw    [%o0+1], %o4
3383         sllx    %o4, 24, %o4
3384         or      %o4, %o3, %o3
3385         lduh    [%o0+5], %o4
3386         sllx    %o4, 8, %o4
3387         or      %o4, %o3, %o3
3388         ldub    [%o0+7], %o4
3389         or      %o4, %o3, %o4
3390         stxa    %o4, [%o1]%asi
3391 
3392         ldub    [%o0+8], %o4
3393         sllx    %o4, 56, %o3
3394         lduw    [%o0+9], %o4
3395         sllx    %o4, 24, %o4
3396         or      %o4, %o3, %o3
3397         lduh    [%o0+13], %o4
3398         sllx    %o4, 8, %o4
3399         or      %o4, %o3, %o3
3400         ldub    [%o0+15], %o4
3401         or      %o4, %o3, %o4
3402         stxa    %o4, [%o1+8]%asi
3403 
3404         ldub    [%o0+16], %o4
3405         sllx    %o4, 56, %o3
3406         lduw    [%o0+17], %o4
3407         sllx    %o4, 24, %o4
3408         or      %o4, %o3, %o3
3409         lduh    [%o0+21], %o4
3410         sllx    %o4, 8, %o4
3411         or      %o4, %o3, %o3
3412         ldub    [%o0+23], %o4
3413         or      %o4, %o3, %o4
3414         stxa    %o4, [%o1+16]%asi
3415 
3416         add     %o0, 32, %o0            ! increase src ptr by 32
3417         add     %o1, 32, %o1            ! increase dst ptr by 32
3418 
3419         ldub    [%o0-8], %o4
3420         sllx    %o4, 56, %o3
3421         lduw    [%o0-7], %o4
3422         sllx    %o4, 24, %o4
3423         or      %o4, %o3, %o3
3424         lduh    [%o0-3], %o4
3425         sllx    %o4, 8, %o4
3426         or      %o4, %o3, %o3
3427         ldub    [%o0-1], %o4
3428         or      %o4, %o3, %o4
3429         bgu,pt  %ncc, .co_medbh32       ! repeat if at least 32 bytes left
3430         stxa    %o4, [%o1-8]%asi
3431 
3432 .co_medbh31:
3433         addcc   %o2, 24, %o2            ! adjust count to be off by 7
3434         ble,pt  %ncc, .co_medb7         ! skip if 7 or fewer bytes left
3435         nop                             !
3436 .co_medbh15:
3437         ldub    [%o0], %o4              ! load and store a block of 8 bytes
3438         sllx    %o4, 56, %o3
3439         lduw    [%o0+1], %o4
3440         sllx    %o4, 24, %o4
3441         or      %o4, %o3, %o3
3442         lduh    [%o0+5], %o4
3443         sllx    %o4, 8, %o4
3444         or      %o4, %o3, %o3
3445         ldub    [%o0+7], %o4
3446         or      %o4, %o3, %o4
3447         stxa    %o4, [%o1]%asi
3448         subcc   %o2, 8, %o2             ! decrement length count
3449         add     %o1, 8, %o1             ! increase dst ptr by 8
3450         add     %o0, 8, %o0             ! increase src ptr by 8
3451         bgu,pt  %ncc, .co_medbh15
3452         stxa    %o4, [%o1-8]%asi
3453         ba      .co_medb7
3454         nop
3455 /*
3456  * End of small copy (no window) code
3457  */
3458 
3459 /*
3460  * Long copy code
3461  */
3462 .co_copy_more:
3463         sethi   %hi(copyio_fault), %o3
3464         or      %o3, %lo(copyio_fault), %o3
3465         membar  #Sync
3466         stn     %o3, [THREAD_REG + T_LOFAULT]
3467 
3468 /*
3469  * Following code is for large copies. We know there is at
3470  * least FP_COPY bytes available. FP regs are used, so
3471  *  we save registers and fp regs before starting
3472  */
3473         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3474         or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3475         rd      %fprs, %g1              ! check for unused fp
3476         ! if fprs.fef == 0, set it.
3477         ! Setting it when already set costs more than checking
3478         andcc   %g1, FPRS_FEF, %g1      ! test FEF, fprs.du = fprs.dl = 0
3479         bz,pt   %ncc, .co_fp_unused
3480         mov     ASI_USER, %asi
3481         BST_FP_TOSTACK(%o3)
3482         ba      .co_fp_ready
3483 .co_fp_unused:
3484         prefetch [%i0 + (1 * CACHE_LINE)], #one_read
3485         wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
3486 .co_fp_ready:
3487         rd      %gsr, %l5               ! save %gsr value
3488         andcc   %i1, 1, %o3             ! is dest byte aligned
3489         bnz,pt  %ncc, .co_big_d1
3490 .co_big_d1f:                            ! dest is now half word aligned
3491         andcc   %i1, 2, %o3
3492         bnz,pt  %ncc, .co_big_d2
3493 .co_big_d2f:                            ! dest is now word aligned
3494         andcc   %i1, 4, %o3             ! is dest longword aligned
3495         bnz,pt  %ncc, .co_big_d4
3496 .co_big_d4f:                            ! dest is now long word aligned
3497         andcc   %i0, 7, %o3             ! is src long word aligned
3498         brnz,pt %o3, .co_big_unal8
3499         prefetch [%i0 + (2 * CACHE_LINE)], #one_read
3500         ! Src and dst are long word aligned
3501         ! align dst to 64 byte boundary
3502         andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
3503         brz,pn  %o3, .co_al_to_64
3504         nop
3505         sub     %o3, 64, %o3            ! %o3 has negative bytes to move
3506         add     %i2, %o3, %i2           ! adjust remaining count
3507         andcc   %o3, 8, %o4             ! odd long words to move?
3508         brz,pt  %o4, .co_al_to_16
3509         nop
3510         add     %o3, 8, %o3
3511         ldx     [%i0], %o4
3512         add     %i0, 8, %i0             ! increment src ptr
3513         stxa    %o4, [%i1]ASI_USER
3514         add     %i1, 8, %i1             ! increment dst ptr
3515 ! Dest is aligned on 16 bytes, src 8 byte aligned
3516 .co_al_to_16:
        andcc   %o3, 0x30, %o4          ! more to move?
3518         brz,pt  %o4, .co_al_to_64
3519         nop
3520 .co_al_mv_16:
3521         add     %o3, 16, %o3
3522         ldx     [%i0], %o4
3523         stxa    %o4, [%i1]ASI_USER
3524         add     %i0, 16, %i0            ! increment src ptr
3525         ldx     [%i0-8], %o4
3526         add     %i1, 8, %i1             ! increment dst ptr
3527         stxa    %o4, [%i1]ASI_USER
3528         andcc   %o3, 0x30, %o4
3529         brnz,pt %o4, .co_al_mv_16
3530         add     %i1, 8, %i1             ! increment dst ptr
3531 ! Dest is aligned on 64 bytes, src 8 byte aligned
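! The .co_aln_* loops below prime %d0-%d14 with the long words that
! precede the next 64-byte source block boundary, then each pass
! (illustrative sketch only):
!
!       block-load 64 bytes into %d16-%d30;
!       rotate carried words into %d0-%d14 via fmovd;
!       block-init store %d0-%d14 to the user destination;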
3532 .co_al_to_64:
3533         ! Determine source alignment
3534         ! to correct 8 byte offset
3535         andcc   %i0, 32, %o3
3536         brnz,pn %o3, .co_aln_1
3537         andcc   %i0, 16, %o3
3538         brnz,pn %o3, .co_aln_01
3539         andcc   %i0, 8, %o3
3540         brz,pn  %o3, .co_aln_000
3541         prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3542         ba      .co_aln_001
3543         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3544 .co_aln_01:
3545         brnz,pn %o3, .co_aln_011
3546         prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3547         ba      .co_aln_010
3548         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3549 .co_aln_1:
3550         andcc   %i0, 16, %o3
3551         brnz,pn %o3, .co_aln_11
3552         andcc   %i0, 8, %o3
3553         brnz,pn %o3, .co_aln_101
3554         prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3555         ba      .co_aln_100
3556         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3557 .co_aln_11:
3558         brz,pn  %o3, .co_aln_110
3559         prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3560 
3561 .co_aln_111:
3562 ! Alignment off by 8 bytes
3563         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3564         ldd     [%i0], %d0
3565         add     %i0, 8, %i0
3566         sub     %i2, 8, %i2
3567         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3568         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3569         sub     %i1, %i0, %i1
3570 .co_aln_111_loop:
3571         ldda    [%i0]ASI_BLK_P,%d16             ! block load
3572         subcc   %o3, 64, %o3
3573         fmovd   %d16, %d2
3574         fmovd   %d18, %d4
3575         fmovd   %d20, %d6
3576         fmovd   %d22, %d8
3577         fmovd   %d24, %d10
3578         fmovd   %d26, %d12
3579         fmovd   %d28, %d14
3580         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3581         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3582         add     %i0, 64, %i0
3583         fmovd   %d30, %d0
3584         bgt,pt  %ncc, .co_aln_111_loop
3585         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3586         add     %i1, %i0, %i1
3587 
3588         stda    %d0, [%i1]ASI_USER
3589         ba      .co_remain_stuff
3590         add     %i1, 8, %i1
3591         ! END OF aln_111
3592 
3593 .co_aln_110:
3594 ! Alignment off by 16 bytes
3595         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3596         ldd     [%i0], %d0
3597         ldd     [%i0+8], %d2
3598         add     %i0, 16, %i0
3599         sub     %i2, 16, %i2
3600         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3601         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3602         sub     %i1, %i0, %i1
3603 .co_aln_110_loop:
3604         ldda    [%i0]ASI_BLK_P,%d16             ! block load
3605         subcc   %o3, 64, %o3
3606         fmovd   %d16, %d4
3607         fmovd   %d18, %d6
3608         fmovd   %d20, %d8
3609         fmovd   %d22, %d10
3610         fmovd   %d24, %d12
3611         fmovd   %d26, %d14
3612         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3613         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3614         add     %i0, 64, %i0
3615         fmovd   %d28, %d0
3616         fmovd   %d30, %d2
3617         bgt,pt  %ncc, .co_aln_110_loop
3618         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3619         add     %i1, %i0, %i1
3620 
3621         stda    %d0, [%i1]%asi
3622         stda    %d2, [%i1+8]%asi
3623         ba      .co_remain_stuff
3624         add     %i1, 16, %i1
3625         ! END OF aln_110
3626 
3627 .co_aln_101:
3628 ! Alignment off by 24 bytes
3629         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3630         ldd     [%i0], %d0
3631         ldd     [%i0+8], %d2
3632         ldd     [%i0+16], %d4
3633         add     %i0, 24, %i0
3634         sub     %i2, 24, %i2
3635         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3636         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3637         sub     %i1, %i0, %i1
3638 .co_aln_101_loop:
3639         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3640         subcc   %o3, 64, %o3
3641         fmovd   %d16, %d6
3642         fmovd   %d18, %d8
3643         fmovd   %d20, %d10
3644         fmovd   %d22, %d12
3645         fmovd   %d24, %d14
3646         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3647         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3648         add     %i0, 64, %i0
3649         fmovd   %d26, %d0
3650         fmovd   %d28, %d2
3651         fmovd   %d30, %d4
3652         bgt,pt  %ncc, .co_aln_101_loop
3653         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3654         add     %i1, %i0, %i1
3655 
3656         stda    %d0, [%i1]%asi
3657         stda    %d2, [%i1+8]%asi
3658         stda    %d4, [%i1+16]%asi
3659         ba      .co_remain_stuff
3660         add     %i1, 24, %i1
3661         ! END OF aln_101
3662 
3663 .co_aln_100:
3664 ! Alignment off by 32 bytes
3665         ldd     [%i0], %d0
3666         ldd     [%i0+8], %d2
3667         ldd     [%i0+16],%d4
3668         ldd     [%i0+24],%d6
3669         add     %i0, 32, %i0
3670         sub     %i2, 32, %i2
3671         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3672         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3673         sub     %i1, %i0, %i1
3674 .co_aln_100_loop:
3675         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3676         subcc   %o3, 64, %o3
3677         fmovd   %d16, %d8
3678         fmovd   %d18, %d10
3679         fmovd   %d20, %d12
3680         fmovd   %d22, %d14
3681         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3682         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3683         add     %i0, 64, %i0
3684         fmovd   %d24, %d0
3685         fmovd   %d26, %d2
3686         fmovd   %d28, %d4
3687         fmovd   %d30, %d6
3688         bgt,pt  %ncc, .co_aln_100_loop
3689         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3690         add     %i1, %i0, %i1
3691 
3692         stda    %d0, [%i1]%asi
3693         stda    %d2, [%i1+8]%asi
3694         stda    %d4, [%i1+16]%asi
3695         stda    %d6, [%i1+24]%asi
3696         ba      .co_remain_stuff
3697         add     %i1, 32, %i1
3698         ! END OF aln_100
3699 
3700 .co_aln_011:
3701 ! Alignment off by 40 bytes
3702         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3703         ldd     [%i0], %d0
3704         ldd     [%i0+8], %d2
3705         ldd     [%i0+16], %d4
3706         ldd     [%i0+24], %d6
3707         ldd     [%i0+32], %d8
3708         add     %i0, 40, %i0
3709         sub     %i2, 40, %i2
3710         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3711         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3712         sub     %i1, %i0, %i1
3713 .co_aln_011_loop:
3714         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3715         subcc   %o3, 64, %o3
3716         fmovd   %d16, %d10
3717         fmovd   %d18, %d12
3718         fmovd   %d20, %d14
3719         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3720         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3721         add     %i0, 64, %i0
3722         fmovd   %d22, %d0
3723         fmovd   %d24, %d2
3724         fmovd   %d26, %d4
3725         fmovd   %d28, %d6
3726         fmovd   %d30, %d8
3727         bgt,pt  %ncc, .co_aln_011_loop
3728         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3729         add     %i1, %i0, %i1
3730 
3731         stda    %d0, [%i1]%asi
3732         stda    %d2, [%i1+8]%asi
3733         stda    %d4, [%i1+16]%asi
3734         stda    %d6, [%i1+24]%asi
3735         stda    %d8, [%i1+32]%asi
3736         ba      .co_remain_stuff
3737         add     %i1, 40, %i1
3738         ! END OF aln_011
3739 
3740 .co_aln_010:
3741 ! Alignment off by 48 bytes
3742         ldd     [%i0], %d0
3743         ldd     [%i0+8], %d2
3744         ldd     [%i0+16], %d4
3745         ldd     [%i0+24], %d6
3746         ldd     [%i0+32], %d8
3747         ldd     [%i0+40], %d10
3748         add     %i0, 48, %i0
3749         sub     %i2, 48, %i2
3750         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3751         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3752         sub     %i1, %i0, %i1
3753 .co_aln_010_loop:
3754         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3755         subcc   %o3, 64, %o3
3756         fmovd   %d16, %d12
3757         fmovd   %d18, %d14
3758         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3759         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3760         add     %i0, 64, %i0
3761         fmovd   %d20, %d0
3762         fmovd   %d22, %d2
3763         fmovd   %d24, %d4
3764         fmovd   %d26, %d6
3765         fmovd   %d28, %d8
3766         fmovd   %d30, %d10
3767         bgt,pt  %ncc, .co_aln_010_loop
3768         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3769         add     %i1, %i0, %i1
3770 
3771         stda    %d0, [%i1]%asi
3772         stda    %d2, [%i1+8]%asi
3773         stda    %d4, [%i1+16]%asi
3774         stda    %d6, [%i1+24]%asi
3775         stda    %d8, [%i1+32]%asi
3776         stda    %d10, [%i1+40]%asi
3777         ba      .co_remain_stuff
3778         add     %i1, 48, %i1
3779         ! END OF aln_010
3780 
3781 .co_aln_001:
3782 ! Alignment off by 56 bytes
3783         ldd     [%i0], %d0
3784         ldd     [%i0+8], %d2
3785         ldd     [%i0+16], %d4
3786         ldd     [%i0+24], %d6
3787         ldd     [%i0+32], %d8
3788         ldd     [%i0+40], %d10
3789         ldd     [%i0+48], %d12
3790         add     %i0, 56, %i0
3791         sub     %i2, 56, %i2
3792         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3793         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3794         sub     %i1, %i0, %i1
3795 .co_aln_001_loop:
3796         ldda    [%i0]ASI_BLK_P,%d16     ! block load
3797         subcc   %o3, 64, %o3
3798         fmovd   %d16, %d14
3799         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3800         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3801         add     %i0, 64, %i0
3802         fmovd   %d18, %d0
3803         fmovd   %d20, %d2
3804         fmovd   %d22, %d4
3805         fmovd   %d24, %d6
3806         fmovd   %d26, %d8
3807         fmovd   %d28, %d10
3808         fmovd   %d30, %d12
3809         bgt,pt  %ncc, .co_aln_001_loop
3810         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3811         add     %i1, %i0, %i1
3812 
3813         stda    %d0, [%i1]%asi
3814         stda    %d2, [%i1+8]%asi
3815         stda    %d4, [%i1+16]%asi
3816         stda    %d6, [%i1+24]%asi
3817         stda    %d8, [%i1+32]%asi
3818         stda    %d10, [%i1+40]%asi
3819         stda    %d12, [%i1+48]%asi
3820         ba      .co_remain_stuff
3821         add     %i1, 56, %i1
3822         ! END OF aln_001
3823 
3824 .co_aln_000:
3825         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3826         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
3827         and     %i2, 0x7f, %i2          ! residue bytes in %i2
3828         sub     %i1, %i0, %i1
3829 .co_aln_000_loop:
3830         ldda    [%i0]ASI_BLK_P,%d0
3831         subcc   %o3, 64, %o3
3832         stxa    %g0,[%i0+%i1]ASI_STBI_AIUS      ! block initializing store
3833         stda    %d0,[%i0+%i1]ASI_BLK_AIUS
3834         add     %i0, 64, %i0
3835         bgt,pt  %ncc, .co_aln_000_loop
3836         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3837         add     %i1, %i0, %i1
3838 
3839         ! END OF aln_000
3840 
3841 .co_remain_stuff:
3842         subcc   %i2, 31, %i2            ! adjust length to allow cc test
3843         ble,pt  %ncc, .co_aln_31
3844         nop
3845 .co_aln_32:
3846         ldx     [%i0], %o4              ! move 32 bytes
3847         subcc   %i2, 32, %i2            ! decrement length count by 32
3848         stxa    %o4, [%i1]%asi
3849         ldx     [%i0+8], %o4
3850         stxa    %o4, [%i1+8]%asi
3851         ldx     [%i0+16], %o4
3852         add     %i0, 32, %i0            ! increase src ptr by 32
3853         stxa    %o4, [%i1+16]%asi
3854         ldx     [%i0-8], %o4
3855         add     %i1, 32, %i1            ! increase dst ptr by 32
3856         bgu,pt  %ncc, .co_aln_32        ! repeat if at least 32 bytes left
3857         stxa    %o4, [%i1-8]%asi
3858 .co_aln_31:
3859         addcc   %i2, 24, %i2            ! adjust count to be off by 7
3860         ble,pt  %ncc, .co_aln_7         ! skip if 7 or fewer bytes left
3861         nop                             !
3862 .co_aln_15:
3863         ldx     [%i0], %o4              ! move 8 bytes
3864         add     %i0, 8, %i0             ! increase src ptr by 8
3865         subcc   %i2, 8, %i2             ! decrease count by 8
3866         add     %i1, 8, %i1             ! increase dst ptr by 8
3867         bgu,pt  %ncc, .co_aln_15
3868         stxa    %o4, [%i1-8]%asi
3869 .co_aln_7:
3870         addcc   %i2, 7, %i2             ! finish adjustment of remaining count
3871         bz,pt   %ncc, .co_exit          ! exit if finished
3872         cmp     %i2, 4
3873         blt,pt  %ncc, .co_unaln3x       ! skip if less than 4 bytes left
3874         nop                             !
3875         ld      [%i0], %o4              ! move 4 bytes
3876         add     %i0, 4, %i0             ! increase src ptr by 4
3877         add     %i1, 4, %i1             ! increase dst ptr by 4
3878         subcc   %i2, 4, %i2             ! decrease count by 4
3879         bnz     .co_unaln3x
3880         stwa    %o4, [%i1-4]%asi
3881         ba      .co_exit
3882         nop
3883 
3884         ! destination alignment code
3885 .co_big_d1:
3886         ldub    [%i0], %o4              ! move a byte
3887         add     %i0, 1, %i0
3888         stba    %o4, [%i1]ASI_USER
3889         add     %i1, 1, %i1
3890         andcc   %i1, 2, %o3
3891         bz,pt   %ncc, .co_big_d2f
3892         sub     %i2, 1, %i2
3893 .co_big_d2:
3894         ldub    [%i0], %o4              ! move a half-word (src align unknown)
3895         ldub    [%i0+1], %o3
3896         add     %i0, 2, %i0
3897         sll     %o4, 8, %o4             ! position
3898         or      %o4, %o3, %o4           ! merge
3899         stha    %o4, [%i1]ASI_USER
3900         add     %i1, 2, %i1
3901         andcc   %i1, 4, %o3             ! is dest longword aligned
3902         bz,pt   %ncc, .co_big_d4f
3903         sub     %i2, 2, %i2
3904 .co_big_d4:                             ! dest is at least word aligned
3905         nop
3906         ldub    [%i0], %o4              ! move a word (src align unknown)
3907         ldub    [%i0+1], %o3
3908         sll     %o4, 24, %o4            ! position
3909         sll     %o3, 16, %o3            ! position
3910         or      %o4, %o3, %o3           ! merge
3911         ldub    [%i0+2], %o4
3912         sll     %o4, 8, %o4             ! position
3913         or      %o4, %o3, %o3           ! merge
3914         ldub    [%i0+3], %o4
3915         or      %o4, %o3, %o4           ! merge
3916         stwa    %o4,[%i1]ASI_USER       ! store four bytes
3917         add     %i0, 4, %i0             ! adjust src by 4
3918         add     %i1, 4, %i1             ! adjust dest by 4
3919         ba      .co_big_d4f
3920         sub     %i2, 4, %i2             ! adjust count by 4
3921 
3922 
3923         ! Dst is on 8 byte boundary; src is not;
3924 .co_big_unal8:
3925         andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
3926         bz      %ncc, .co_unalnsrc
3927         sub     %o3, 64, %o3            ! %o3 will be multiple of 8
3928         neg     %o3                     ! bytes until dest is 64 byte aligned
3929         sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
3930         ! Move bytes according to source alignment
3931         andcc   %i0, 0x1, %o4
3932         bnz     %ncc, .co_unalnbyte     ! check for byte alignment
3933         nop
3934         andcc   %i0, 2, %o4             ! check for half word alignment
3935         bnz     %ncc, .co_unalnhalf
3936         nop
3937         ! Src is word aligned, move bytes until dest 64 byte aligned
3938 .co_unalnword:
3939         ld      [%i0], %o4              ! load 4 bytes
3940         stwa    %o4, [%i1]%asi          ! and store 4 bytes
3941         ld      [%i0+4], %o4            ! load 4 bytes
3942         add     %i0, 8, %i0             ! increase src ptr by 8
3943         stwa    %o4, [%i1+4]%asi        ! and store 4 bytes
3944         subcc   %o3, 8, %o3             ! decrease count by 8
3945         bnz     %ncc, .co_unalnword
3946         add     %i1, 8, %i1             ! increase dst ptr by 8
3947         ba      .co_unalnsrc
3948         nop
3949 
3950         ! Src is half-word aligned, move bytes until dest 64 byte aligned
3951 .co_unalnhalf:
3952         lduh    [%i0], %o4              ! load 2 bytes
3953         sllx    %o4, 32, %i3            ! shift left
3954         lduw    [%i0+2], %o4
3955         or      %o4, %i3, %i3
3956         sllx    %i3, 16, %i3
3957         lduh    [%i0+6], %o4
3958         or      %o4, %i3, %i3
3959         stxa    %i3, [%i1]ASI_USER
3960         add     %i0, 8, %i0
3961         subcc   %o3, 8, %o3
3962         bnz     %ncc, .co_unalnhalf
3963         add     %i1, 8, %i1
3964         ba      .co_unalnsrc
3965         nop
3966 
        ! Src is byte aligned, move bytes until dest 64 byte aligned
3968 .co_unalnbyte:
3969         sub     %i1, %i0, %i1           ! share pointer advance
3970 .co_unalnbyte_loop:
3971         ldub    [%i0], %o4
3972         sllx    %o4, 56, %i3
3973         lduh    [%i0+1], %o4
3974         sllx    %o4, 40, %o4
3975         or      %o4, %i3, %i3
3976         lduh    [%i0+3], %o4
3977         sllx    %o4, 24, %o4
3978         or      %o4, %i3, %i3
3979         lduh    [%i0+5], %o4
3980         sllx    %o4, 8, %o4
3981         or      %o4, %i3, %i3
3982         ldub    [%i0+7], %o4
3983         or      %o4, %i3, %i3
3984         stxa    %i3, [%i1+%i0]ASI_USER
3985         subcc   %o3, 8, %o3
3986         bnz     %ncc, .co_unalnbyte_loop
3987         add     %i0, 8, %i0
3988         add     %i1,%i0, %i1            ! restore pointer
3989 
3990         ! Destination is now block (64 byte aligned), src is not 8 byte aligned
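/*
 * The .co_unaln_* loops below software-pipeline faligndata: each pass
 * block-loads 64 new source bytes and merges them with the tail of
 * the previous block to emit one aligned 64-byte store.  Per-pass
 * sketch (illustrative only):
 *
 *      new = block_load(src);          ! %d16-%d30
 *      out = faligndata(old, new);     ! %d48-%d62, offset from %gsr
 *      block_store(dst, out);          ! ASI_BLK_AIUS
 *      old = tail_of(new);
 */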
3991 .co_unalnsrc:
3992         andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
3993         and     %i2, 0x3f, %i2          ! residue bytes in %i2
        add     %i2, 64, %i2            ! Ensure we don't load beyond
3995         sub     %i3, 64, %i3            ! end of source buffer
3996 
3997         andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
3998         prefetch [%o4 + (3 * CACHE_LINE)], #one_read
3999         alignaddr %i0, %g0, %g0         ! generate %gsr
4000         add     %i0, %i3, %i0           ! advance %i0 to after blocks
4001         !
4002         ! Determine source alignment to correct 8 byte offset
4003         andcc   %i0, 0x20, %o3
4004         brnz,pn %o3, .co_unaln_1
4005         andcc   %i0, 0x10, %o3
4006         brnz,pn %o3, .co_unaln_01
4007         andcc   %i0, 0x08, %o3
4008         brz,a   %o3, .co_unaln_000
4009         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4010         ba      .co_unaln_001
4011         nop
4012 .co_unaln_01:
4013         brnz,a  %o3, .co_unaln_011
4014         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4015         ba      .co_unaln_010
4016         nop
4017 .co_unaln_1:
4018         brnz,pn %o3, .co_unaln_11
4019         andcc   %i0, 0x08, %o3
4020         brnz,a  %o3, .co_unaln_101
4021         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4022         ba      .co_unaln_100
4023         nop
4024 .co_unaln_11:
4025         brz,pn  %o3, .co_unaln_110
4026         prefetch [%i0 + (4 * CACHE_LINE)], #one_read
4027 
4028 .co_unaln_111:
4029         ldd     [%o4+56], %d14
4030 .co_unaln_111_loop:
4031         add     %o4, 64, %o4
4032         ldda    [%o4]ASI_BLK_P, %d16
4033         faligndata %d14, %d16, %d48
4034         faligndata %d16, %d18, %d50
4035         faligndata %d18, %d20, %d52
4036         faligndata %d20, %d22, %d54
4037         faligndata %d22, %d24, %d56
4038         faligndata %d24, %d26, %d58
4039         faligndata %d26, %d28, %d60
4040         faligndata %d28, %d30, %d62
4041         fmovd   %d30, %d14
4042         stda    %d48, [%i1]ASI_BLK_AIUS
4043         subcc   %i3, 64, %i3
4044         add     %i1, 64, %i1
4045         bgu,pt  %ncc, .co_unaln_111_loop
4046         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4047         ba      .co_unaln_done
4048         nop
4049 
4050 .co_unaln_110:
4051         ldd     [%o4+48], %d12
4052         ldd     [%o4+56], %d14
4053 .co_unaln_110_loop:
4054         add     %o4, 64, %o4
4055         ldda    [%o4]ASI_BLK_P, %d16
4056         faligndata %d12, %d14, %d48
4057         faligndata %d14, %d16, %d50
4058         faligndata %d16, %d18, %d52
4059         faligndata %d18, %d20, %d54
4060         faligndata %d20, %d22, %d56
4061         faligndata %d22, %d24, %d58
4062         faligndata %d24, %d26, %d60
4063         faligndata %d26, %d28, %d62
4064         fmovd   %d28, %d12
4065         fmovd   %d30, %d14
4066         stda    %d48, [%i1]ASI_BLK_AIUS
4067         subcc   %i3, 64, %i3
4068         add     %i1, 64, %i1
4069         bgu,pt  %ncc, .co_unaln_110_loop
4070         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4071         ba      .co_unaln_done
4072         nop
4073 
4074 .co_unaln_101:
4075         ldd     [%o4+40], %d10
4076         ldd     [%o4+48], %d12
4077         ldd     [%o4+56], %d14
4078 .co_unaln_101_loop:
4079         add     %o4, 64, %o4
4080         ldda    [%o4]ASI_BLK_P, %d16
4081         faligndata %d10, %d12, %d48
4082         faligndata %d12, %d14, %d50
4083         faligndata %d14, %d16, %d52
4084         faligndata %d16, %d18, %d54
4085         faligndata %d18, %d20, %d56
4086         faligndata %d20, %d22, %d58
4087         faligndata %d22, %d24, %d60
4088         faligndata %d24, %d26, %d62
4089         fmovd   %d26, %d10
4090         fmovd   %d28, %d12
4091         fmovd   %d30, %d14
4092         stda    %d48, [%i1]ASI_BLK_AIUS
4093         subcc   %i3, 64, %i3
4094         add     %i1, 64, %i1
4095         bgu,pt  %ncc, .co_unaln_101_loop
4096         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4097         ba      .co_unaln_done
4098         nop
4099 
4100 .co_unaln_100:
4101         ldd     [%o4+32], %d8
4102         ldd     [%o4+40], %d10
4103         ldd     [%o4+48], %d12
4104         ldd     [%o4+56], %d14
4105 .co_unaln_100_loop:
4106         add     %o4, 64, %o4
4107         ldda    [%o4]ASI_BLK_P, %d16
4108         faligndata %d8, %d10, %d48
4109         faligndata %d10, %d12, %d50
4110         faligndata %d12, %d14, %d52
4111         faligndata %d14, %d16, %d54
4112         faligndata %d16, %d18, %d56
4113         faligndata %d18, %d20, %d58
4114         faligndata %d20, %d22, %d60
4115         faligndata %d22, %d24, %d62
4116         fmovd   %d24, %d8
4117         fmovd   %d26, %d10
4118         fmovd   %d28, %d12
4119         fmovd   %d30, %d14
4120         stda    %d48, [%i1]ASI_BLK_AIUS
4121         subcc   %i3, 64, %i3
4122         add     %i1, 64, %i1
4123         bgu,pt  %ncc, .co_unaln_100_loop
4124         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4125         ba      .co_unaln_done
4126         nop
4127 
4128 .co_unaln_011:
4129         ldd     [%o4+24], %d6
4130         ldd     [%o4+32], %d8
4131         ldd     [%o4+40], %d10
4132         ldd     [%o4+48], %d12
4133         ldd     [%o4+56], %d14
4134 .co_unaln_011_loop:
4135         add     %o4, 64, %o4
4136         ldda    [%o4]ASI_BLK_P, %d16
4137         faligndata %d6, %d8, %d48
4138         faligndata %d8, %d10, %d50
4139         faligndata %d10, %d12, %d52
4140         faligndata %d12, %d14, %d54
4141         faligndata %d14, %d16, %d56
4142         faligndata %d16, %d18, %d58
4143         faligndata %d18, %d20, %d60
4144         faligndata %d20, %d22, %d62
4145         fmovd   %d22, %d6
4146         fmovd   %d24, %d8
4147         fmovd   %d26, %d10
4148         fmovd   %d28, %d12
4149         fmovd   %d30, %d14
4150         stda    %d48, [%i1]ASI_BLK_AIUS
4151         subcc   %i3, 64, %i3
4152         add     %i1, 64, %i1
4153         bgu,pt  %ncc, .co_unaln_011_loop
4154         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4155         ba      .co_unaln_done
4156         nop
4157 
4158 .co_unaln_010:
4159         ldd     [%o4+16], %d4
4160         ldd     [%o4+24], %d6
4161         ldd     [%o4+32], %d8
4162         ldd     [%o4+40], %d10
4163         ldd     [%o4+48], %d12
4164         ldd     [%o4+56], %d14
4165 .co_unaln_010_loop:
4166         add     %o4, 64, %o4
4167         ldda    [%o4]ASI_BLK_P, %d16
4168         faligndata %d4, %d6, %d48
4169         faligndata %d6, %d8, %d50
4170         faligndata %d8, %d10, %d52
4171         faligndata %d10, %d12, %d54
4172         faligndata %d12, %d14, %d56
4173         faligndata %d14, %d16, %d58
4174         faligndata %d16, %d18, %d60
4175         faligndata %d18, %d20, %d62
4176         fmovd   %d20, %d4
4177         fmovd   %d22, %d6
4178         fmovd   %d24, %d8
4179         fmovd   %d26, %d10
4180         fmovd   %d28, %d12
4181         fmovd   %d30, %d14
4182         stda    %d48, [%i1]ASI_BLK_AIUS
4183         subcc   %i3, 64, %i3
4184         add     %i1, 64, %i1
4185         bgu,pt  %ncc, .co_unaln_010_loop
4186         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4187         ba      .co_unaln_done
4188         nop
4189 
4190 .co_unaln_001:
4191         ldd     [%o4+8], %d2
4192         ldd     [%o4+16], %d4
4193         ldd     [%o4+24], %d6
4194         ldd     [%o4+32], %d8
4195         ldd     [%o4+40], %d10
4196         ldd     [%o4+48], %d12
4197         ldd     [%o4+56], %d14
4198 .co_unaln_001_loop:
4199         add     %o4, 64, %o4
4200         ldda    [%o4]ASI_BLK_P, %d16
4201         faligndata %d2, %d4, %d48
4202         faligndata %d4, %d6, %d50
4203         faligndata %d6, %d8, %d52
4204         faligndata %d8, %d10, %d54
4205         faligndata %d10, %d12, %d56
4206         faligndata %d12, %d14, %d58
4207         faligndata %d14, %d16, %d60
4208         faligndata %d16, %d18, %d62
4209         fmovd   %d18, %d2
4210         fmovd   %d20, %d4
4211         fmovd   %d22, %d6
4212         fmovd   %d24, %d8
4213         fmovd   %d26, %d10
4214         fmovd   %d28, %d12
4215         fmovd   %d30, %d14
4216         stda    %d48, [%i1]ASI_BLK_AIUS
4217         subcc   %i3, 64, %i3
4218         add     %i1, 64, %i1
4219         bgu,pt  %ncc, .co_unaln_001_loop
4220         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4221         ba      .co_unaln_done
4222         nop
4223 
4224 .co_unaln_000:
4225         ldda    [%o4]ASI_BLK_P, %d0
4226 .co_unaln_000_loop:
4227         add     %o4, 64, %o4
4228         ldda    [%o4]ASI_BLK_P, %d16
4229         faligndata %d0, %d2, %d48
4230         faligndata %d2, %d4, %d50
4231         faligndata %d4, %d6, %d52
4232         faligndata %d6, %d8, %d54
4233         faligndata %d8, %d10, %d56
4234         faligndata %d10, %d12, %d58
4235         faligndata %d12, %d14, %d60
4236         faligndata %d14, %d16, %d62
4237         fmovd   %d16, %d0
4238         fmovd   %d18, %d2
4239         fmovd   %d20, %d4
4240         fmovd   %d22, %d6
4241         fmovd   %d24, %d8
4242         fmovd   %d26, %d10
4243         fmovd   %d28, %d12
4244         fmovd   %d30, %d14
4245         stda    %d48, [%i1]ASI_BLK_AIUS
4246         subcc   %i3, 64, %i3
4247         add     %i1, 64, %i1
4248         bgu,pt  %ncc, .co_unaln_000_loop
4249         prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4250 
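/*
 * Illustrative sketch of the .co_unaln_* loops above (C-like
 * pseudo-code, not assembled; variable names are ours). Each
 * variant keeps the tail doublewords of the previous 64-byte
 * block live in %d0-%d14 so every iteration can emit one fully
 * aligned 64-byte store to user space:
 *
 *	for (; blocks != 0; src += 64, dst += 64) {
 *		next = block_load(src);		! ldda ASI_BLK_P
 *		out = align(prev, next);	! 8 x faligndata
 *		block_store(dst, out);		! stda ASI_BLK_AIUS
 *		prev = tail(next);		! fmovd shuffle
 *	}
 */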
4251 .co_unaln_done:
4252         ! Handle trailing bytes, 64 to 127
4253         ! Dest long word aligned, Src not long word aligned
4254         cmp     %i2, 15
4255         bleu    %ncc, .co_unaln_short
4256 
4257         andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
4258         and     %i2, 0x7, %i2           ! residue bytes in %i2
4259         add     %i2, 8, %i2
4260         sub     %i3, 8, %i3             ! ensure we don't load past end of src
4261         andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
4262         add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
4263         ldd     [%o4], %d0              ! fetch partial word
4264 .co_unaln_by8:
4265         ldd     [%o4+8], %d2
4266         add     %o4, 8, %o4
4267         faligndata %d0, %d2, %d16
4268         subcc   %i3, 8, %i3
4269         stda    %d16, [%i1]%asi
4270         fmovd   %d2, %d0
4271         bgu,pt  %ncc, .co_unaln_by8
4272         add     %i1, 8, %i1
4273 
4274 .co_unaln_short:
4275         cmp     %i2, 8
4276         blt,pt  %ncc, .co_unalnfin
4277         nop
4278         ldub    [%i0], %o4
4279         sll     %o4, 24, %o3
4280         ldub    [%i0+1], %o4
4281         sll     %o4, 16, %o4
4282         or      %o4, %o3, %o3
4283         ldub    [%i0+2], %o4
4284         sll     %o4, 8, %o4
4285         or      %o4, %o3, %o3
4286         ldub    [%i0+3], %o4
4287         or      %o4, %o3, %o3
4288         stwa    %o3, [%i1]%asi
4289         ldub    [%i0+4], %o4
4290         sll     %o4, 24, %o3
4291         ldub    [%i0+5], %o4
4292         sll     %o4, 16, %o4
4293         or      %o4, %o3, %o3
4294         ldub    [%i0+6], %o4
4295         sll     %o4, 8, %o4
4296         or      %o4, %o3, %o3
4297         ldub    [%i0+7], %o4
4298         or      %o4, %o3, %o3
4299         stwa    %o3, [%i1+4]%asi
4300         add     %i0, 8, %i0
4301         add     %i1, 8, %i1
4302         sub     %i2, 8, %i2
4303 .co_unalnfin:
4304         cmp     %i2, 4
4305         blt,pt  %ncc, .co_unalnz
4306         tst     %i2
4307         ldub    [%i0], %o3              ! read byte
4308         subcc   %i2, 4, %i2             ! reduce count by 4
4309         sll     %o3, 24, %o3            ! position
4310         ldub    [%i0+1], %o4
4311         sll     %o4, 16, %o4            ! position
4312         or      %o4, %o3, %o3           ! merge
4313         ldub    [%i0+2], %o4
4314         sll     %o4, 8, %o4             ! position
4315         or      %o4, %o3, %o3           ! merge
4316         add     %i1, 4, %i1             ! advance dst by 4
4317         ldub    [%i0+3], %o4
4318         add     %i0, 4, %i0             ! advance src by 4
4319         or      %o4, %o3, %o4           ! merge
4320         bnz,pt  %ncc, .co_unaln3x
4321         stwa    %o4, [%i1-4]%asi
4322         ba      .co_exit
4323         nop
4324 .co_unalnz:
4325         bz,pt   %ncc, .co_exit
4326         wr      %l5, %g0, %gsr          ! restore %gsr
4327 .co_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
4328         subcc   %i2, 1, %i2             ! reduce count for cc test
4329         ldub    [%i0], %o4              ! load one byte
4330         bz,pt   %ncc, .co_exit
4331         stba    %o4, [%i1]%asi          ! store one byte
4332         ldub    [%i0+1], %o4            ! load second byte
4333         subcc   %i2, 1, %i2
4334         bz,pt   %ncc, .co_exit
4335         stba    %o4, [%i1+1]%asi        ! store second byte
4336         ldub    [%i0+2], %o4            ! load third byte
4337         stba    %o4, [%i1+2]%asi        ! store third byte
4338 .co_exit:
4339         brnz    %g1, .co_fp_restore
4340         nop
4341         FZERO
4342         wr      %g1, %g0, %fprs
4343         ba,pt   %ncc, .co_ex2
4344         membar  #Sync
4345 .co_fp_restore:
4346         BLD_FP_FROMSTACK(%o4)
4347 .co_ex2:
4348         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4349         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4350         ret
4351         restore %g0, 0, %o0
4352 
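/*
 * Illustrative sketch of .copyout_err below (C-like pseudo-code,
 * not assembled): if the thread has a copyops vector installed,
 * the failure is handed to its copyout handler; otherwise the
 * routine fails with -1:
 *
 *	if ((cp = curthread->t_copyops) != NULL)
 *		return (cp->cp_copyout(...));	! tail call via %g2
 *	return (-1);
 */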
4353 .copyout_err:
4354         ldn     [THREAD_REG + T_COPYOPS], %o4
4355         brz     %o4, 2f
4356         nop
4357         ldn     [%o4 + CP_COPYOUT], %g2
4358         jmp     %g2
4359         nop
4360 2:
4361         retl
4362         mov     -1, %o0
4363 
4364 #else   /* NIAGARA_IMPL */
4365 .do_copyout:
4366         !
4367         ! Check the length and bail if zero.
4368         !
4369         tst     %o2
4370         bnz,pt  %ncc, 1f
4371         nop
4372         retl
4373         clr     %o0
4374 1:
4375         sethi   %hi(copyio_fault), %o4
4376         or      %o4, %lo(copyio_fault), %o4
4377         sethi   %hi(copyio_fault_nowindow), %o3
4378         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
4379         or      %o3, %lo(copyio_fault_nowindow), %o3
4380         membar  #Sync
4381         stn     %o3, [THREAD_REG + T_LOFAULT]
4382 
4383         mov     %o0, SAVE_SRC
4384         mov     %o1, SAVE_DST
4385         mov     %o2, SAVE_COUNT
4386 
4387         !
4388         ! Check to see if we're more than SMALL_LIMIT (7 bytes).
4389         ! Run in leaf mode, using the %o regs as our input regs.
4390         !
4391         subcc   %o2, SMALL_LIMIT, %o3
4392         bgu,a,pt %ncc, .dco_ns
4393         or      %o0, %o1, %o3
4394         !
4395         ! What was previously ".small_copyout"
4396         ! Do full differenced copy.
4397         !
4398 .dcobcp:
4399         sub     %g0, %o2, %o3           ! negate count
4400         add     %o0, %o2, %o0           ! make %o0 point at the end
4401         add     %o1, %o2, %o1           ! make %o1 point at the end
4402         ba,pt   %ncc, .dcocl
4403         ldub    [%o0 + %o3], %o4        ! load first byte
4404         !
4405         ! %o0 and %o1 point at the end and remain pointing at the end
4406         ! of their buffers. We pull things out by adding %o3 (which is
4407         ! the negation of the length) to the buffer end, which gives us
4408         ! the current location in the buffers. By incrementing %o3 we walk
4409         ! through both buffers without having to bump each buffer's
4410         ! pointer. A very fast 4 instruction loop.
4411         !
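        !
        ! Equivalent C-like sketch of the loop below (illustrative
        ! only; variable names are ours):
        !       off = -count;
        !       src += count; dst += count;
        !       do {
        !               dst[off] = src[off];
        !       } while (++off < 0);
        !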
4412         .align 16
4413 .dcocl:
4414         stba    %o4, [%o1 + %o3]ASI_USER
4415         inccc   %o3
4416         bl,a,pt %ncc, .dcocl
4417         ldub    [%o0 + %o3], %o4
4418         !
4419         ! We're done. Go home.
4420         !
4421         membar  #Sync
4422         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
4423         retl
4424         clr     %o0
4425         !
4426         ! Try aligned copies from here.
4427         !
4428 .dco_ns:
4429         ! %o0 = kernel addr (to be copied from)
4430         ! %o1 = user addr (to be copied to)
4431         ! %o2 = length
4432         ! %o3 = %o0 | %o1 (used for alignment checking)
4433         ! %o4 is alternate lo_fault
4434         ! %o5 is original lo_fault
4435         !
4436         ! See if we're single byte aligned. If we are, check the
4437         ! limit for single byte copies. If we're smaller or equal,
4438         ! bounce to the byte for byte copy loop. Otherwise do it in
4439         ! HW (if enabled).
4440         !
4441         btst    1, %o3
4442         bz,pt   %icc, .dcoh8
4443         btst    7, %o3
4444         !
4445         ! Single byte aligned. Do we do it via HW or via
4446         ! byte for byte? Do a quick no memory reference
4447         ! check to pick up small copies.
4448         !
4449         sethi   %hi(hw_copy_limit_1), %o3
4450         !
4451         ! Big enough that we need to check the HW limit for
4452         ! this size copy.
4453         !
4454         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
4455         !
4456         ! Is HW copy on? If not, do everything byte for byte.
4457         !
4458         tst     %o3
4459         bz,pn   %icc, .dcobcp
4460         subcc   %o3, %o2, %o3
4461         !
4462         ! If we're less than or equal to the single byte copy limit,
4463         ! bop to the copy loop.
4464         !
4465         bge,pt  %ncc, .dcobcp
4466         nop
4467         !
4468         ! We're big enough and copy is on. Do it with HW.
4469         !
4470         ba,pt   %ncc, .big_copyout
4471         nop
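        !
        ! Illustrative sketch of the dispatch just made (C-like
        ! pseudo-code, not assembled):
        !       if (hw_copy_limit_1 == 0)       ! HW copy disabled
        !               goto byte_for_byte;
        !       if (count <= hw_copy_limit_1)   ! too small to pay off
        !               goto byte_for_byte;
        !       goto big_copyout;
        !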
4472 .dcoh8:
4473         !
4474         ! 8 byte aligned?
4475         !
4476         bnz,a   %ncc, .dcoh4
4477         btst    3, %o3
4478         !
4479         ! See if we're in the "small range".
4480         ! If so, go off and do the copy.
4481         ! If not, load the hard limit. %o3 is
4482         ! available for reuse.
4483         !
4484         sethi   %hi(hw_copy_limit_8), %o3
4485         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
4486         !
4487         ! If it's zero, there's no HW bcopy.
4488         ! Bop off to the aligned copy.
4489         !
4490         tst     %o3
4491         bz,pn   %icc, .dcos8
4492         subcc   %o3, %o2, %o3
4493         !
4494         ! We're negative if our size is larger than hw_copy_limit_8.
4495         !
4496         bge,pt  %ncc, .dcos8
4497         nop
4498         !
4499         ! HW assist is on and we're large enough. Do it.
4500         !
4501         ba,pt   %ncc, .big_copyout
4502         nop
4503 .dcos8:
4504         !
4505         ! Housekeeping for copy loops. Uses same idea as in the byte for
4506         ! byte copy loop above.
4507         !
4508         add     %o0, %o2, %o0
4509         add     %o1, %o2, %o1
4510         sub     %g0, %o2, %o3
4511         ba,pt   %ncc, .dodebc
4512         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
4513         !
4514         ! 4 byte aligned?
4515         !
4516 .dcoh4:
4517         bnz,pn  %ncc, .dcoh2
4518         !
4519         ! See if we're in the "small range".
4520         ! If so, go off and do the copy.
4521         ! If not, load the hard limit. %o3 is
4522         ! available for reuse.
4523         !
4524         sethi   %hi(hw_copy_limit_4), %o3
4525         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
4526         !
4527         ! If it's zero, there's no HW bcopy.
4528         ! Bop off to the aligned copy.
4529         !
4530         tst     %o3
4531         bz,pn   %icc, .dcos4
4532         subcc   %o3, %o2, %o3
4533         !
4534         ! We're negative if our size is larger than hw_copy_limit_4.
4535         !
4536         bge,pt  %ncc, .dcos4
4537         nop
4538         !
4539         ! HW assist is on and we're large enough. Do it.
4540         !
4541         ba,pt   %ncc, .big_copyout
4542         nop
4543 .dcos4:
4544         add     %o0, %o2, %o0
4545         add     %o1, %o2, %o1
4546         sub     %g0, %o2, %o3
4547         ba,pt   %ncc, .dodfbc
4548         srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
4549         !
4550         ! We must be 2 byte aligned. Off we go.
4551         ! The check for small copies was done in the
4552         ! delay slot at .dcoh4
4553         !
4554 .dcoh2:
4555         ble     %ncc, .dcos2
4556         sethi   %hi(hw_copy_limit_2), %o3
4557         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
4558         tst     %o3
4559         bz,pn   %icc, .dcos2
4560         subcc   %o3, %o2, %o3
4561         bge,pt  %ncc, .dcos2
4562         nop
4563         !
4564         ! HW is on and we're big enough. Do it.
4565         !
4566         ba,pt   %ncc, .big_copyout
4567         nop
4568 .dcos2:
4569         add     %o0, %o2, %o0
4570         add     %o1, %o2, %o1
4571         sub     %g0, %o2, %o3
4572         ba,pt   %ncc, .dodtbc
4573         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
4574 .small_copyout:
4575         !
4576         ! Why are we doing this AGAIN? There are certain conditions in
4577         ! big_copyout that will cause us to forego the HW assisted copies
4578         ! and bounce back to a non-HW assisted copy. This dispatches those
4579         ! copies. Note that we branch around this in the main line code.
4580         !
4581         ! We make no check for limits or HW enablement here. We've
4582         ! already been told that we're a poster child so just go off
4583         ! and do it.
4584         !
4585         or      %o0, %o1, %o3
4586         btst    1, %o3
4587         bnz     %icc, .dcobcp           ! Most likely
4588         btst    7, %o3
4589         bz      %icc, .dcos8
4590         btst    3, %o3
4591         bz      %icc, .dcos4
4592         nop
4593         ba,pt   %ncc, .dcos2
4594         nop
4595         .align 32
4596 .dodebc:
4597         ldx     [%o0 + %o3], %o4
4598         deccc   %o2
4599         stxa    %o4, [%o1 + %o3]ASI_USER
4600         bg,pt   %ncc, .dodebc
4601         addcc   %o3, 8, %o3
4602         !
4603         ! End of copy loop. Check to see if we're done. Most
4604         ! eight byte aligned copies end here.
4605         !
4606         bz,pt   %ncc, .dcofh
4607         nop
4608         !
4609         ! Something is left - do it byte for byte.
4610         ! 
4611         ba,pt   %ncc, .dcocl
4612         ldub    [%o0 + %o3], %o4        ! load next byte
4613         !
4614         ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
4615         !
4616         .align 32
4617 .dodfbc:
4618         lduw    [%o0 + %o3], %o4
4619         deccc   %o2
4620         sta     %o4, [%o1 + %o3]ASI_USER
4621         bg,pt   %ncc, .dodfbc
4622         addcc   %o3, 4, %o3
4623         !
4624         ! End of copy loop. Check to see if we're done. Most
4625         ! four byte aligned copies end here.
4626         !
4627         bz,pt   %ncc, .dcofh
4628         nop
4629         !
4630         ! Something is left. Do it byte for byte.
4631         !
4632         ba,pt   %ncc, .dcocl
4633         ldub    [%o0 + %o3], %o4        ! load next byte
4634         !
4635         ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
4636         ! copy.
4637         !
4638         .align 32
4639 .dodtbc:
4640         lduh    [%o0 + %o3], %o4
4641         deccc   %o2
4642         stha    %o4, [%o1 + %o3]ASI_USER
4643         bg,pt   %ncc, .dodtbc
4644         addcc   %o3, 2, %o3
4645         !
4646         ! End of copy loop. Anything left?
4647         !
4648         bz,pt   %ncc, .dcofh
4649         nop
4650         !
4651         ! Deal with the last byte
4652         !
4653         ldub    [%o0 + %o3], %o4
4654         stba    %o4, [%o1 + %o3]ASI_USER
4655 .dcofh:
4656         membar  #Sync
4657         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4658         retl
4659         clr     %o0
4660 
4661 .big_copyout:
4662         ! We're going to go off and do a block copy.
4663         ! Switch fault handlers and grab a window. We
4664         ! don't do a membar #Sync since we've referenced
4665         ! only kernel data to this point.
4666         stn     %o4, [THREAD_REG + T_LOFAULT]
4667 
4668         ! Copyouts that reach here are larger than 256 bytes. The
4669         ! hw_copy_limit_1 is set to 256. Never set this limit to
4670         ! less than 128 bytes.
4671         save    %sp, -SA(MINFRAME), %sp
4672 .do_block_copyout:
4673 
4674         ! Swap src/dst since the code below is memcpy code
4675         ! and memcpy/bcopy have different calling sequences
4676         mov     %i1, %i5
4677         mov     %i0, %i1
4678         mov     %i5, %i0
4679 
4680         ! Block (64 bytes) align the destination.
4681         andcc   %i0, 0x3f, %i3          ! is dst block aligned
4682         bz      %ncc, copyout_blalign   ! dst already block aligned
4683         sub     %i3, 0x40, %i3
4684         neg     %i3                     ! bytes till dst 64 bytes aligned
4685         sub     %i2, %i3, %i2           ! update i2 with new count
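        ! Alignment math sketch (illustrative): %i3 held dst & 0x3f
        ! on entry, so after the sub/neg above
        !       to_align = 64 - (dst & 0x3f);
        !       count -= to_align;
        ! and the small loops below retire to_align bytes.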
4686 
4687         ! Based on source and destination alignment do
4688         ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
4689 
4690         ! Is dst & src 8B aligned
4691         or      %i0, %i1, %o2
4692         andcc   %o2, 0x7, %g0
4693         bz      %ncc, .co_alewdcp
4694         nop
4695 
4696         ! Is dst & src 4B aligned
4697         andcc   %o2, 0x3, %g0
4698         bz      %ncc, .co_alwdcp
4699         nop
4700 
4701         ! Is dst & src 2B aligned
4702         andcc   %o2, 0x1, %g0
4703         bz      %ncc, .co_alhlfwdcp
4704         nop
4705 
4706         ! 1B aligned
4707 1:      ldub    [%i1], %o2
4708         stba    %o2, [%i0]ASI_USER
4709         inc     %i1
4710         deccc   %i3
4711         bgu,pt  %ncc, 1b
4712         inc     %i0
4713 
4714         ba      copyout_blalign
4715         nop
4716 
4717         ! dst & src 4B aligned
4718 .co_alwdcp:
4719         ld      [%i1], %o2
4720         sta     %o2, [%i0]ASI_USER
4721         add     %i1, 0x4, %i1
4722         subcc   %i3, 0x4, %i3
4723         bgu,pt  %ncc, .co_alwdcp
4724         add     %i0, 0x4, %i0
4725 
4726         ba      copyout_blalign
4727         nop
4728 
4729         ! dst & src 2B aligned
4730 .co_alhlfwdcp:
4731         lduh    [%i1], %o2
4732         stuha   %o2, [%i0]ASI_USER
4733         add     %i1, 0x2, %i1
4734         subcc   %i3, 0x2, %i3
4735         bgu,pt  %ncc, .co_alhlfwdcp
4736         add     %i0, 0x2, %i0
4737 
4738         ba      copyout_blalign
4739         nop
4740 
4741         ! dst & src 8B aligned
4742 .co_alewdcp:
4743         ldx     [%i1], %o2
4744         stxa    %o2, [%i0]ASI_USER
4745         add     %i1, 0x8, %i1
4746         subcc   %i3, 0x8, %i3
4747         bgu,pt  %ncc, .co_alewdcp
4748         add     %i0, 0x8, %i0
4749 
4750         ! Now Destination is block (64 bytes) aligned
4751 copyout_blalign:
4752         andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
4753         sub     %i2, %i3, %i2           ! Residue bytes in %i2
4754 
4755         mov     ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4756 
4757         andcc   %i1, 0xf, %o2           ! is src quadword aligned
4758         bz,pn   %xcc, .co_blkcpy        ! src offset in %o2 (last 4-bits)
4759         nop
4760         cmp     %o2, 0x8
4761         bg      .co_upper_double
4762         nop
4763         bl      .co_lower_double
4764         nop
4765 
4766         ! Falls through when the source offset is equal to 8, i.e.
4767         ! the source is double word aligned.
4768         ! In this case no shift/merge of data is required.
4769 
4770         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
4771         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
4772         prefetch [%l0+0x0], #one_read
4773         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4774 .co_loop0:
4775         add     %i1, 0x10, %i1
4776         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4777         prefetch [%l0+0x40], #one_read
4778 
4779         stxa    %l3, [%i0+0x0]%asi
4780         stxa    %l4, [%i0+0x8]%asi
4781 
4782         add     %i1, 0x10, %i1
4783         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4784 
4785         stxa    %l5, [%i0+0x10]%asi
4786         stxa    %l2, [%i0+0x18]%asi
4787 
4788         add     %i1, 0x10, %i1
4789         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4790 
4791         stxa    %l3, [%i0+0x20]%asi
4792         stxa    %l4, [%i0+0x28]%asi
4793 
4794         add     %i1, 0x10, %i1
4795         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4796 
4797         stxa    %l5, [%i0+0x30]%asi
4798         stxa    %l2, [%i0+0x38]%asi
4799 
4800         add     %l0, 0x40, %l0
4801         subcc   %i3, 0x40, %i3
4802         bgu,pt  %xcc, .co_loop0
4803         add     %i0, 0x40, %i0
4804         ba      .co_blkdone
4805         add     %i1, %o2, %i1           ! increment the source by src offset
4806                                         ! the src offset was stored in %o2
4807 
4808 .co_lower_double:
4809 
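        ! Shift setup sketch (illustrative, C-like; the exact operand
        ! order lives inside the ALIGN_DATA macro): for source offset
        ! off = src & 0xf (1..7 here), each aligned output doubleword
        ! pairs the tail of one input doubleword with the head of the
        ! next, roughly
        !       out = (prev << (off * 8)) | (next >> (64 - off * 8));
        ! %o0 carries the left shift and %o1 the right shift.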
4810         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
4811         sll     %o2, 3, %o0             ! %o0 left shift
4812         mov     0x40, %o1
4813         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
4814         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
4815         prefetch [%l0+0x0], #one_read
4816         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2    ! %l2 has partial data,
4817                                         ! %l3 has complete data
4818 .co_loop1:
4819         add     %i1, 0x10, %i1
4820         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4    ! %l4 has partial data
4821                                                         ! for this read.
4822         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
4823                                                         ! into %l2 and %l3
4824         prefetch [%l0+0x40], #one_read
4825 
4826         stxa    %l2, [%i0+0x0]%asi
4827         stxa    %l3, [%i0+0x8]%asi
4828 
4829         add     %i1, 0x10, %i1
4830         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4831         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
4832                                                         ! %l4 from previous read
4833                                                         ! into %l4 and %l5
4834         stxa    %l4, [%i0+0x10]%asi
4835         stxa    %l5, [%i0+0x18]%asi
4836 
4837         ! Repeat the same for next 32 bytes.
4838 
4839         add     %i1, 0x10, %i1
4840         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4841         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
4842 
4843         stxa    %l2, [%i0+0x20]%asi
4844         stxa    %l3, [%i0+0x28]%asi
4845 
4846         add     %i1, 0x10, %i1
4847         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4848         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
4849 
4850         stxa    %l4, [%i0+0x30]%asi
4851         stxa    %l5, [%i0+0x38]%asi
4852 
4853         add     %l0, 0x40, %l0
4854         subcc   %i3, 0x40, %i3
4855         bgu,pt  %xcc, .co_loop1
4856         add     %i0, 0x40, %i0
4857         ba      .co_blkdone
4858         add     %i1, %o2, %i1           ! increment the source by src offset
4859                                         ! the src offset was stored in %o2
4860 
4861 .co_upper_double:
4862 
4863         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
4864         sub     %o2, 0x8, %o0
4865         sll     %o0, 3, %o0             ! %o0 left shift
4866         mov     0x40, %o1
4867         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
4868         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
4869         prefetch [%l0+0x0], #one_read
4870         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2    ! partial data in %l3
4871                                                         ! for this read and
4872                                                         ! no data in %l2
4873 .co_loop2:
4874         add     %i1, 0x10, %i1
4875         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4    ! %l4 has complete data
4876                                                         ! and %l5 has partial
4877         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
4878                                                         ! into %l3 and %l4
4879         prefetch [%l0+0x40], #one_read
4880 
4881         stxa    %l3, [%i0+0x0]%asi
4882         stxa    %l4, [%i0+0x8]%asi
4883 
4884         add     %i1, 0x10, %i1
4885         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4886         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
4887                                                         ! %l5 from previous read
4888                                                         ! into %l5 and %l2
4889 
4890         stxa    %l5, [%i0+0x10]%asi
4891         stxa    %l2, [%i0+0x18]%asi
4892 
4893         ! Repeat the same for next 32 bytes.
4894 
4895         add     %i1, 0x10, %i1
4896         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4897         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
4898 
4899         stxa    %l3, [%i0+0x20]%asi
4900         stxa    %l4, [%i0+0x28]%asi
4901 
4902         add     %i1, 0x10, %i1
4903         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4904         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
4905 
4906         stxa    %l5, [%i0+0x30]%asi
4907         stxa    %l2, [%i0+0x38]%asi
4908 
4909         add     %l0, 0x40, %l0
4910         subcc   %i3, 0x40, %i3
4911         bgu,pt  %xcc, .co_loop2
4912         add     %i0, 0x40, %i0
4913         ba      .co_blkdone
4914         add     %i1, %o2, %i1           ! increment the source by src offset
4915                                         ! the src offset was stored in %o2
4916 
4917 
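        ! Illustrative sketch of .co_blkcpy below (C-like, not
        ! assembled): source and destination are both suitably
        ! aligned here, so each 64-byte block moves as four 16-byte
        ! quad loads feeding eight 8-byte user stores, prefetching
        ! one cache line ahead:
        !       for (; blocks != 0; src += 64, dst += 64) {
        !               prefetch(src_block + 64);
        !               copy 4 x 16 bytes;
        !       }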
4918         ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
4919 .co_blkcpy:
4920 
4921         andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
4922         prefetch [%o0+0x0], #one_read
4923 1:
4924         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
4925         add     %i1, 0x10, %i1
4926         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
4927         add     %i1, 0x10, %i1
4928 
4929         prefetch [%o0+0x40], #one_read
4930 
4931         stxa    %l0, [%i0+0x0]%asi
4932 
4933         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
4934         add     %i1, 0x10, %i1
4935         ldda    [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
4936         add     %i1, 0x10, %i1
4937 
4938         stxa    %l1, [%i0+0x8]%asi
4939         stxa    %l2, [%i0+0x10]%asi
4940         stxa    %l3, [%i0+0x18]%asi
4941         stxa    %l4, [%i0+0x20]%asi
4942         stxa    %l5, [%i0+0x28]%asi
4943         stxa    %l6, [%i0+0x30]%asi
4944         stxa    %l7, [%i0+0x38]%asi
4945 
4946         add     %o0, 0x40, %o0
4947         subcc   %i3, 0x40, %i3
4948         bgu,pt  %xcc, 1b
4949         add     %i0, 0x40, %i0
4950 
4951 .co_blkdone:
4952         membar  #Sync
4953 
4954         brz,pt  %i2, .copyout_exit
4955         nop
4956 
4957         ! Handle trailing bytes
4958         cmp     %i2, 0x8
4959         blu,pt  %ncc, .co_residue
4960         nop
4961 
4962         ! Can we do some 8B ops
4963         or      %i1, %i0, %o2
4964         andcc   %o2, 0x7, %g0
4965         bnz     %ncc, .co_last4
4966         nop
4967 
4968         ! Do 8-byte ops as long as possible
4969 .co_last8:
4970         ldx     [%i1], %o2
4971         stxa    %o2, [%i0]ASI_USER
4972         add     %i1, 0x8, %i1
4973         sub     %i2, 0x8, %i2
4974         cmp     %i2, 0x8
4975         bgu,pt  %ncc, .co_last8
4976         add     %i0, 0x8, %i0
4977 
4978         brz,pt  %i2, .copyout_exit
4979         nop
4980 
4981         ba      .co_residue
4982         nop
4983 
4984 .co_last4:
4985         ! Can we do 4B ops
4986         andcc   %o2, 0x3, %g0
4987         bnz     %ncc, .co_last2
4988         nop
4989 1:
4990         ld      [%i1], %o2
4991         sta     %o2, [%i0]ASI_USER
4992         add     %i1, 0x4, %i1
4993         sub     %i2, 0x4, %i2
4994         cmp     %i2, 0x4
4995         bgu,pt  %ncc, 1b
4996         add     %i0, 0x4, %i0
4997 
4998         brz,pt  %i2, .copyout_exit
4999         nop
5000 
5001         ba      .co_residue
5002         nop
5003 
5004 .co_last2:
5005         ! Can we do 2B ops
5006         andcc   %o2, 0x1, %g0
5007         bnz     %ncc, .co_residue
5008         nop
5009 
5010 1:
5011         lduh    [%i1], %o2
5012         stuha   %o2, [%i0]ASI_USER
5013         add     %i1, 0x2, %i1
5014         sub     %i2, 0x2, %i2
5015         cmp     %i2, 0x2
5016         bgu,pt  %ncc, 1b
5017         add     %i0, 0x2, %i0
5018 
5019         brz,pt  %i2, .copyout_exit
5020         nop
5021 
5022         ! Copy the residue as byte copy
5023 .co_residue:
5024         ldub    [%i1], %i4
5025         stba    %i4, [%i0]ASI_USER
5026         inc     %i1
5027         deccc   %i2
5028         bgu,pt  %xcc, .co_residue
5029         inc     %i0
5030 
5031 .copyout_exit:
5032         membar  #Sync
5033         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
5034         ret
5035         restore %g0, 0, %o0
5036 
5037 .copyout_err:
5038         ldn     [THREAD_REG + T_COPYOPS], %o4
5039         brz     %o4, 2f
5040         nop
5041         ldn     [%o4 + CP_COPYOUT], %g2
5042         jmp     %g2
5043         nop
5044 2:
5045         retl
5046         mov     -1, %o0
5047 #endif  /* NIAGARA_IMPL */
5048         SET_SIZE(copyout)
5049 
5050 
5051         ENTRY(xcopyout)
5052         sethi   %hi(.xcopyout_err), REAL_LOFAULT
5053         b       .do_copyout
5054         or      REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
5055 .xcopyout_err:
5056         ldn     [THREAD_REG + T_COPYOPS], %o4
5057         brz     %o4, 2f
5058         nop
5059         ldn     [%o4 + CP_XCOPYOUT], %g2
5060         jmp     %g2
5061         nop
5062 2:
5063         retl
5064         mov     %g1, %o0
5065         SET_SIZE(xcopyout)
5066 
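/*
 * Illustrative sketch of xcopyout_little below (C-like pseudo-code,
 * not assembled; variable names are ours). One negative index drives
 * both buffers, and the load pointer is biased so the source is read
 * from its last byte toward its first while stores go through
 * ASI_AIUSL, the little-endian user ASI. Net effect, as read from
 * the code: dst[i] = src[count - 1 - i].
 *
 *	off = -count;
 *	ld = src + 2 * count - 1;	! so ld[off] == src[count-1]
 *	st = dst + count;		! so st[off] == dst[0]
 *	do {
 *		st[off] = ld[off];
 *		ld -= 2;		! walk the source backward
 *	} while (++off < 0);
 */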
5067         ENTRY(xcopyout_little)
5068         sethi   %hi(.little_err), %o4
5069         ldn     [THREAD_REG + T_LOFAULT], %o5
5070         or      %o4, %lo(.little_err), %o4
5071         membar  #Sync                   ! sync error barrier
5072         stn     %o4, [THREAD_REG + T_LOFAULT]
5073 
5074         subcc   %g0, %o2, %o3
5075         add     %o0, %o2, %o0
5076         bz,pn   %ncc, 2f                ! check for zero bytes
5077         sub     %o2, 1, %o4
5078         add     %o0, %o4, %o0           ! start w/last byte
5079         add     %o1, %o2, %o1
5080         ldub    [%o0+%o3], %o4
5081 
5082 1:      stba    %o4, [%o1+%o3]ASI_AIUSL
5083         inccc   %o3
5084         sub     %o0, 2, %o0             ! get next byte
5085         bcc,a,pt %ncc, 1b
5086         ldub    [%o0+%o3], %o4
5087 
5088 2:      membar  #Sync                   ! sync error barrier
5089         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
5090         retl
5091         mov     %g0, %o0                ! return (0)
5092         SET_SIZE(xcopyout_little)
5093 
5094 /*
5095  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
5096  */
5097 
5098         ENTRY(copyin)
5099         sethi   %hi(.copyin_err), REAL_LOFAULT
5100         or      REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
5101 
5102 #if !defined(NIAGARA_IMPL)
5103 .do_copyin:
5104         tst     %o2                     ! check for zero count;  quick exit
5105         bz,pt   %ncc, .ci_smallqx
5106         mov     %o0, SAVE_SRC
5107         mov     %o1, SAVE_DST
5108         mov     %o2, SAVE_COUNT
5109         cmp     %o2, FP_COPY            ! check for small copy/leaf case
5110         bgt,pt  %ncc, .ci_copy_more
5111         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
5112 /*
5113  * Small copy in code
5114  * 
5115  */
5116         sethi   %hi(copyio_fault_nowindow), %o3
5117         or      %o3, %lo(copyio_fault_nowindow), %o3
5118         membar  #Sync
5119         stn     %o3, [THREAD_REG + T_LOFAULT]
5120 
5121         mov     ASI_USER, %asi
5122         cmp     %o2, SHORTCOPY          ! make sure there is enough to align
5123         ble,pt  %ncc, .ci_smallest
5124         andcc   %o1, 0x7, %o3           ! is dest long word aligned
5125         bnz,pn  %ncc, .ci_align
5126         andcc   %o1, 1, %o3             ! is dest byte aligned
5127 
5128 ! Destination is long word aligned
5129 .ci_al_src:
5130         andcc   %o0, 7, %o3
5131         brnz,pt %o3, .ci_src_dst_unal8
5132         nop
5133 /*
5134  * Special case for handling when src and dest are both long word aligned
5135  * and the total data to move is less than FP_COPY bytes.
5136  * Also handles the finish-up for large block moves, so the count may be less than 32 bytes.
5137  */
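/*
 * Count-bias sketch for the loops below (illustrative, C-like):
 * the count is kept biased so a bare condition-code test decides
 * each tier:
 *
 *	count -= 31;		! count > 0  <=>  >= 32 bytes left
 *	while (count > 0) { move 32 bytes; count -= 32; }
 *	count += 24;		! count > 0  <=>  >= 8 bytes left
 *	while (count > 0) { move 8 bytes; count -= 8; }
 *	count += 7;		! true remaining byte count
 */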
5138 .ci_medlong:
5139         subcc   %o2, 31, %o2            ! adjust length to allow cc test
5140         ble,pt  %ncc, .ci_medl31
5141         nop
5142 .ci_medl32:
5143         ldxa    [%o0]%asi, %o4          ! move 32 bytes
5144         subcc   %o2, 32, %o2            ! decrement length count by 32
5145         stx     %o4, [%o1]
5146         ldxa    [%o0+8]%asi, %o4
5147         stx     %o4, [%o1+8]
5148         ldxa    [%o0+16]%asi, %o4
5149         add     %o0, 32, %o0            ! increase src ptr by 32
5150         stx     %o4, [%o1+16]
5151         ldxa    [%o0-8]%asi, %o4
5152         add     %o1, 32, %o1            ! increase dst ptr by 32
5153         bgu,pt  %ncc, .ci_medl32        ! repeat if at least 32 bytes left
5154         stx     %o4, [%o1-8]
5155 .ci_medl31:
5156         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5157         ble,pt  %ncc, .ci_medl7         ! skip if 7 or fewer bytes left
5158         nop
5159 .ci_medl8:
5160         ldxa    [%o0]%asi, %o4          ! move 8 bytes
5161         add     %o0, 8, %o0             ! increase src ptr by 8
5162         subcc   %o2, 8, %o2             ! decrease count by 8
5163         add     %o1, 8, %o1             ! increase dst ptr by 8
5164         bgu,pt  %ncc, .ci_medl8
5165         stx     %o4, [%o1-8]
5166 .ci_medl7:
5167         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
5168         bnz,pt  %ncc, .ci_small4        ! do final bytes if not finished
5169         nop
5170 .ci_smallx:                             ! finish up and exit
5171         membar  #Sync
5172         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5173 .ci_smallqx:
5174         retl
5175         mov     %g0, %o0
5176 
5177 .ci_small4:
5178         cmp     %o2, 4
5179         blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
5180         nop                             !
5181         lda     [%o0]%asi, %o4          ! move 4 bytes
5182         add     %o0, 4, %o0             ! increase src ptr by 4
5183         add     %o1, 4, %o1             ! increase dst ptr by 4
5184         subcc   %o2, 4, %o2             ! decrease count by 4
5185         bz      %ncc, .ci_smallx
5186         stw     %o4, [%o1-4]
5187 
5188 .ci_small3x:                            ! Exactly 1, 2, or 3 bytes remain
5189         subcc   %o2, 1, %o2             ! reduce count for cc test
5190         lduba   [%o0]%asi, %o4          ! load one byte
5191         bz,pt   %ncc, .ci_smallx
5192         stb     %o4, [%o1]              ! store one byte
5193         lduba   [%o0+1]%asi, %o4        ! load second byte
5194         subcc   %o2, 1, %o2
5195         bz,pt   %ncc, .ci_smallx
5196         stb     %o4, [%o1+1]            ! store second byte
5197         lduba   [%o0+2]%asi, %o4        ! load third byte
5198         ba      .ci_smallx
5199         stb     %o4, [%o1+2]            ! store third byte
5200 
5201 .ci_smallest:                           ! 7 or fewer bytes remain
5202         cmp     %o2, 4
5203         blt,pt  %ncc, .ci_small3x
5204         nop
5205         lduba   [%o0]%asi, %o4          ! read byte
5206         subcc   %o2, 4, %o2             ! reduce count by 4
5207         stb     %o4, [%o1]              ! write byte
5208         lduba   [%o0+1]%asi, %o4        ! repeat for total of 4 bytes
5209         add     %o0, 4, %o0             ! advance src by 4
5210         stb     %o4, [%o1+1]
5211         lduba   [%o0-2]%asi, %o4
5212         add     %o1, 4, %o1             ! advance dst by 4
5213         stb     %o4, [%o1-2]
5214         lduba   [%o0-1]%asi, %o4
5215         bnz,pt  %ncc, .ci_small3x
5216         stb     %o4, [%o1-1]
5217         membar  #Sync
5218         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5219         retl
5220         mov     %g0, %o0
5221 
5222 .ci_align:
5223         bnz,pt  %ncc, .ci_al_d1
5224 .ci_al_d1f:                             ! dest is now half word aligned
5225         andcc   %o1, 2, %o3             ! is dest word aligned
5226         bnz,pt  %ncc, .ci_al_d2
5227 .ci_al_d2f:                             ! dest is now word aligned
5228         andcc   %o1, 4, %o3             ! is dest longword aligned?
5229         bz,pt   %ncc, .ci_al_src
5230         nop
5231 .ci_al_d4:                              ! dest is word aligned;  src is unknown
5232         lduba   [%o0]%asi, %o4          ! move a word (src align unknown)
5233         lduba   [%o0+1]%asi, %o3
5234         sll     %o4, 24, %o4            ! position
5235         sll     %o3, 16, %o3            ! position
5236         or      %o4, %o3, %o3           ! merge
5237         lduba   [%o0+2]%asi, %o4
5238         sll     %o4, 8, %o4             ! position
5239         or      %o4, %o3, %o3           ! merge
5240         lduba   [%o0+3]%asi, %o4
5241         or      %o4, %o3, %o4           ! merge
5242         stw     %o4,[%o1]               ! store four bytes
5243         add     %o0, 4, %o0             ! adjust src by 4
5244         add     %o1, 4, %o1             ! adjust dest by 4
5245         sub     %o2, 4, %o2             ! adjust count by 4
5246         andcc   %o0, 7, %o3             ! check for src long word alignment
5247         brz,pt  %o3, .ci_medlong
5248 .ci_src_dst_unal8:
5249         ! dst is 8-byte aligned, src is not
5250         ! Size is less than FP_COPY
5251         ! Following code is to select for alignment
5252         andcc   %o0, 0x3, %o3           ! test word alignment
5253         bz,pt   %ncc, .ci_medword
5254         nop
5255         andcc   %o0, 0x1, %o3           ! test halfword alignment
5256         bnz,pt  %ncc, .ci_med_byte      ! go to byte move if not halfword
5257         andcc   %o0, 0x2, %o3           ! test which byte alignment
5258         ba      .ci_medhalf
5259         nop
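        ! Dispatch sketch for .ci_src_dst_unal8 above (illustrative):
        !       if ((src & 3) == 0) goto .ci_medword;   ! word aligned
        !       if ((src & 1) == 0) goto .ci_medhalf;   ! halfword aligned
        !       goto .ci_med_byte;      ! odd; (src & 2), set in the
        !                               ! delay slot, picks the variant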
5260 .ci_al_d1:                              ! align dest to half word
5261         lduba   [%o0]%asi, %o4          ! move a byte
5262         add     %o0, 1, %o0
5263         stb     %o4, [%o1]
5264         add     %o1, 1, %o1
5265         andcc   %o1, 2, %o3             ! is dest word aligned
5266         bz,pt   %ncc, .ci_al_d2f
5267         sub     %o2, 1, %o2
5268 .ci_al_d2:                              ! align dest to word
5269         lduba   [%o0]%asi, %o4          ! move a half-word (src align unknown)
5270         lduba   [%o0+1]%asi, %o3
5271         sll     %o4, 8, %o4             ! position
5272         or      %o4, %o3, %o4           ! merge
5273         sth     %o4, [%o1]
5274         add     %o0, 2, %o0
5275         add     %o1, 2, %o1
5276         andcc   %o1, 4, %o3             ! is dest longword aligned?
5277         bz,pt   %ncc, .ci_al_src
5278         sub     %o2, 2, %o2
5279         ba      .ci_al_d4
5280         nop
5281 /*
5282  * Handle all cases where src and dest are aligned on word
5283  * boundaries. Use unrolled loops for better performance.
5284  * This option wins over the standard large data move when
5285  * the source and destination are in cache for medium
5286  * to short data moves.
5287  */
5288 .ci_medword:
5289         subcc   %o2, 31, %o2            ! adjust length to allow cc test
5290         ble,pt  %ncc, .ci_medw31
5291         nop
5292 .ci_medw32:
5293         lda     [%o0]%asi, %o4          ! move a block of 32 bytes
5294         stw     %o4, [%o1]
5295         lda     [%o0+4]%asi, %o4
5296         stw     %o4, [%o1+4]
5297         lda     [%o0+8]%asi, %o4
5298         stw     %o4, [%o1+8]
5299         lda     [%o0+12]%asi, %o4
5300         stw     %o4, [%o1+12]
5301         lda     [%o0+16]%asi, %o4
5302         stw     %o4, [%o1+16]
5303         lda     [%o0+20]%asi, %o4
5304         subcc   %o2, 32, %o2            ! decrement length count
5305         stw     %o4, [%o1+20]
5306         lda     [%o0+24]%asi, %o4
5307         add     %o0, 32, %o0            ! increase src ptr by 32
5308         stw     %o4, [%o1+24]
5309         lda     [%o0-4]%asi, %o4
5310         add     %o1, 32, %o1            ! increase dst ptr by 32
5311         bgu,pt  %ncc, .ci_medw32        ! repeat if at least 32 bytes left
5312         stw     %o4, [%o1-4]
5313 .ci_medw31:
5314         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5315         ble,pt  %ncc, .ci_medw7         ! skip if 7 or fewer bytes left
5316         nop                             !
5317 .ci_medw15:
5318         lda     [%o0]%asi, %o4          ! move a block of 8 bytes
5319         subcc   %o2, 8, %o2             ! decrement length count
5320         stw     %o4, [%o1]
5321         add     %o0, 8, %o0             ! increase src ptr by 8
5322         lda     [%o0-4]%asi, %o4
5323         add     %o1, 8, %o1             ! increase dst ptr by 8
5324         bgu,pt  %ncc, .ci_medw15
5325         stw     %o4, [%o1-4]
5326 .ci_medw7:
5327         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
5328         bz,pt   %ncc, .ci_smallx        ! exit if finished
5329         cmp     %o2, 4
5330         blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
5331         nop                             !
5332         lda     [%o0]%asi, %o4          ! move 4 bytes
5333         add     %o0, 4, %o0             ! increase src ptr by 4
5334         add     %o1, 4, %o1             ! increase dst ptr by 4
5335         subcc   %o2, 4, %o2             ! decrease count by 4
5336         bnz     .ci_small3x
5337         stw     %o4, [%o1-4]
5338         membar  #Sync
5339         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5340         retl
5341         mov     %g0, %o0
5342 
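        ! Merge sketch for .ci_medhalf below (illustrative, C-like):
        ! each 8-byte store is assembled 2-4-2 so every user load is
        ! naturally aligned:
        !       x = (u16 << 48) | (u32 << 16) | u16;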
5343 .ci_medhalf:
5344         subcc   %o2, 31, %o2            ! adjust length to allow cc test
5345         ble,pt  %ncc, .ci_medh31
5346         nop
5347 .ci_medh32:                             ! load and store block of 32 bytes
5348         subcc   %o2, 32, %o2            ! decrement length count
5349 
5350         lduha   [%o0]%asi, %o4          ! move 32 bytes
5351         lduwa   [%o0+2]%asi, %o3
5352         sllx    %o4, 48, %o4
5353         sllx    %o3, 16, %o3
5354         or      %o4, %o3, %o3
5355         lduha   [%o0+6]%asi, %o4
5356         or      %o4, %o3, %o4
5357         stx     %o4, [%o1]
5358 
5359         lduha   [%o0+8]%asi, %o4
5360         lduwa   [%o0+10]%asi, %o3
5361         sllx    %o4, 48, %o4
5362         sllx    %o3, 16, %o3
5363         or      %o4, %o3, %o3
5364         lduha   [%o0+14]%asi, %o4
5365         or      %o4, %o3, %o4
5366         stx     %o4, [%o1+8]
5367 
5368         lduha   [%o0+16]%asi, %o4
5369         lduwa   [%o0+18]%asi, %o3
5370         sllx    %o4, 48, %o4
5371         sllx    %o3, 16, %o3
5372         or      %o4, %o3, %o3
5373         lduha   [%o0+22]%asi, %o4
5374         or      %o4, %o3, %o4
5375         stx     %o4, [%o1+16]
5376 
5377         add     %o0, 32, %o0            ! increase src ptr by 32
5378         add     %o1, 32, %o1            ! increase dst ptr by 32
5379 
5380         lduha   [%o0-8]%asi, %o4
5381         lduwa   [%o0-6]%asi, %o3
5382         sllx    %o4, 48, %o4
5383         sllx    %o3, 16, %o3
5384         or      %o4, %o3, %o3
5385         lduha   [%o0-2]%asi, %o4
5386         or      %o3, %o4, %o4
5387         bgu,pt  %ncc, .ci_medh32        ! repeat if at least 32 bytes left
5388         stx     %o4, [%o1-8]
5389 
5390 .ci_medh31:
5391         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5392         ble,pt  %ncc, .ci_medh7         ! skip if 7 or fewer bytes left
5393         nop                             !
5394 .ci_medh15:
5395         lduha   [%o0]%asi, %o4          ! move 8 bytes
5396         subcc   %o2, 8, %o2             ! decrement length count
5397         lduwa   [%o0+2]%asi, %o3
5398         sllx    %o4, 48, %o4
5399         sllx    %o3, 16, %o3
5400         or      %o4, %o3, %o3
5401         add     %o1, 8, %o1             ! increase dst ptr by 8
5402         lduha   [%o0+6]%asi, %o4
5403         add     %o0, 8, %o0             ! increase src ptr by 8
5404         or      %o4, %o3, %o4
5405         bgu,pt  %ncc, .ci_medh15
5406         stx     %o4, [%o1-8]
5407 .ci_medh7:
5408         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
5409         bz,pt   %ncc, .ci_smallx        ! exit if finished
5410         cmp     %o2, 4
5411         blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
5412         nop                             !
5413         lduha   [%o0]%asi, %o4
5414         sll     %o4, 16, %o4
5415         lduha   [%o0+2]%asi, %o3
5416         or      %o3, %o4, %o4
5417         subcc   %o2, 4, %o2
5418         add     %o0, 4, %o0
5419         add     %o1, 4, %o1
5420         bnz     .ci_small3x
5421         stw     %o4, [%o1-4]
5422         membar  #Sync
5423         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5424         retl
5425         mov     %g0, %o0
5426 
5427         .align 16
5428 .ci_med_byte:
5429         bnz,pt  %ncc, .ci_medbh32a      ! go to correct byte move
5430         subcc   %o2, 31, %o2            ! adjust length to allow cc test
5431         ble,pt  %ncc, .ci_medb31
5432         nop
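        ! Merge sketch for alignment 1 or 5 (illustrative, C-like):
        ! each 8-byte store is assembled 1-2-4-1 so all wider user
        ! loads are naturally aligned:
        !       x = (u8 << 56) | (u16 << 40) | (u32 << 8) | u8;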
5433 .ci_medb32:                             ! Alignment 1 or 5
5434         subcc   %o2, 32, %o2            ! decrement length count
5435 
5436         lduba   [%o0]%asi, %o4          ! load and store a block of 32 bytes
5437         sllx    %o4, 56, %o3
5438         lduha   [%o0+1]%asi, %o4
5439         sllx    %o4, 40, %o4
5440         or      %o4, %o3, %o3
5441         lduwa   [%o0+3]%asi, %o4
5442         sllx    %o4, 8, %o4
5443         or      %o4, %o3, %o3
5444         lduba   [%o0+7]%asi, %o4
5445         or      %o4, %o3, %o4
5446         stx     %o4, [%o1]
5447 
5448         lduba   [%o0+8]%asi, %o4
5449         sllx    %o4, 56, %o3
5450         lduha   [%o0+9]%asi, %o4
5451         sllx    %o4, 40, %o4
5452         or      %o4, %o3, %o3
5453         lduwa   [%o0+11]%asi, %o4
5454         sllx    %o4, 8, %o4
5455         or      %o4, %o3, %o3
5456         lduba   [%o0+15]%asi, %o4
5457         or      %o4, %o3, %o4
5458         stx     %o4, [%o1+8]
5459 
5460         lduba   [%o0+16]%asi, %o4
5461         sllx    %o4, 56, %o3
5462         lduha   [%o0+17]%asi, %o4
5463         sllx    %o4, 40, %o4
5464         or      %o4, %o3, %o3
5465         lduwa   [%o0+19]%asi, %o4
5466         sllx    %o4, 8, %o4
5467         or      %o4, %o3, %o3
5468         lduba   [%o0+23]%asi, %o4
5469         or      %o4, %o3, %o4
5470         stx     %o4, [%o1+16]
5471 
5472         add     %o0, 32, %o0            ! increase src ptr by 32
5473         add     %o1, 32, %o1            ! increase dst ptr by 32
5474 
5475         lduba   [%o0-8]%asi, %o4
5476         sllx    %o4, 56, %o3
5477         lduha   [%o0-7]%asi, %o4
5478         sllx    %o4, 40, %o4
5479         or      %o4, %o3, %o3
5480         lduwa   [%o0-5]%asi, %o4
5481         sllx    %o4, 8, %o4
5482         or      %o4, %o3, %o3
5483         lduba   [%o0-1]%asi, %o4
5484         or      %o4, %o3, %o4
5485         bgu,pt  %ncc, .ci_medb32        ! repeat if at least 32 bytes left
5486         stx     %o4, [%o1-8]
5487 
5488 .ci_medb31:                             ! 31 or fewer bytes remaining
5489         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5490         ble,pt  %ncc, .ci_medb7         ! skip if 7 or fewer bytes left
5491         nop                             !
5492 .ci_medb15:
5493 
5494         lduba   [%o0]%asi, %o4          ! load and store a block of 8 bytes
5495         subcc   %o2, 8, %o2             ! decrement length count
5496         sllx    %o4, 56, %o3
5497         lduha   [%o0+1]%asi, %o4
5498         sllx    %o4, 40, %o4
5499         or      %o4, %o3, %o3
5500         lduwa   [%o0+3]%asi, %o4
5501         add     %o1, 8, %o1             ! increase dst ptr by 8
5502         sllx    %o4, 8, %o4
5503         or      %o4, %o3, %o3
5504         lduba   [%o0+7]%asi, %o4
5505         add     %o0, 8, %o0             ! increase src ptr by 8
5506         or      %o4, %o3, %o4
5507         bgu,pt  %ncc, .ci_medb15
5508         stx     %o4, [%o1-8]
5509 .ci_medb7:
5510         addcc   %o2, 7, %o2             ! finish adjustment of remaining count
5511         bz,pt   %ncc, .ci_smallx        ! exit if finished
5512         cmp     %o2, 4
5513         blt,pt  %ncc, .ci_small3x       ! skip if less than 4 bytes left
5514         nop                             !
5515         lduba   [%o0]%asi, %o4          ! move 4 bytes
5516         sll     %o4, 24, %o3
5517         lduha   [%o0+1]%asi, %o4
5518         sll     %o4, 8, %o4
5519         or      %o4, %o3, %o3
5520         lduba   [%o0+3]%asi, %o4
5521         or      %o4, %o3, %o4
5522         subcc   %o2, 4, %o2
5523         add     %o0, 4, %o0
5524         add     %o1, 4, %o1
5525         bnz     .ci_small3x
5526         stw     %o4, [%o1-4]
5527         membar  #Sync
5528         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
5529         retl
5530         mov     %g0, %o0
5531 
5532         .align 16
5533 .ci_medbh32a:                           ! Alignment 3 or 7
5534         ble,pt  %ncc, .ci_medbh31
5535         nop
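        ! Merge sketch for alignment 3 or 7 (illustrative, C-like):
        ! each 8-byte store is assembled 1-4-2-1 so the 4-byte user
        ! load lands on a word boundary:
        !       x = (u8 << 56) | (u32 << 24) | (u16 << 8) | u8;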
5536 .ci_medbh32:                            ! Alignment 3 or 7
5537         subcc   %o2, 32, %o2            ! decrement length count
5538 
5539         lduba   [%o0]%asi, %o4          ! load and store a block of 32 bytes
5540         sllx    %o4, 56, %o3
5541         lduwa   [%o0+1]%asi, %o4
5542         sllx    %o4, 24, %o4
5543         or      %o4, %o3, %o3
5544         lduha   [%o0+5]%asi, %o4
5545         sllx    %o4, 8, %o4
5546         or      %o4, %o3, %o3
5547         lduba   [%o0+7]%asi, %o4
5548         or      %o4, %o3, %o4
5549         stx     %o4, [%o1]
5550 
5551         lduba   [%o0+8]%asi, %o4
5552         sllx    %o4, 56, %o3
5553         lduwa   [%o0+9]%asi, %o4
5554         sllx    %o4, 24, %o4
5555         or      %o4, %o3, %o3
5556         lduha   [%o0+13]%asi, %o4
5557         sllx    %o4, 8, %o4
5558         or      %o4, %o3, %o3
5559         lduba   [%o0+15]%asi, %o4
5560         or      %o4, %o3, %o4
5561         stx     %o4, [%o1+8]
5562 
5563         lduba   [%o0+16]%asi, %o4
5564         sllx    %o4, 56, %o3
5565         lduwa   [%o0+17]%asi, %o4
5566         sllx    %o4, 24, %o4
5567         or      %o4, %o3, %o3
5568         lduha   [%o0+21]%asi, %o4
5569         sllx    %o4, 8, %o4
5570         or      %o4, %o3, %o3
5571         lduba   [%o0+23]%asi, %o4
5572         or      %o4, %o3, %o4
5573         stx     %o4, [%o1+16]
5574 
5575         add     %o0, 32, %o0            ! increase src ptr by 32
5576         add     %o1, 32, %o1            ! increase dst ptr by 32
5577 
5578         lduba   [%o0-8]%asi, %o4
5579         sllx    %o4, 56, %o3
5580         lduwa   [%o0-7]%asi, %o4
5581         sllx    %o4, 24, %o4
5582         or      %o4, %o3, %o3
5583         lduha   [%o0-3]%asi, %o4
5584         sllx    %o4, 8, %o4
5585         or      %o4, %o3, %o3
5586         lduba   [%o0-1]%asi, %o4
5587         or      %o4, %o3, %o4
5588         bgu,pt  %ncc, .ci_medbh32       ! repeat if at least 32 bytes left
5589         stx     %o4, [%o1-8]
5590 
5591 .ci_medbh31:
5592         addcc   %o2, 24, %o2            ! adjust count to be off by 7
5593         ble,pt  %ncc, .ci_medb7         ! skip if 7 or fewer bytes left
5594         nop                             !
5595 .ci_medbh15:
5596         lduba   [%o0]%asi, %o4          ! load and store a block of 8 bytes
5597         sllx    %o4, 56, %o3
5598         lduwa   [%o0+1]%asi, %o4
5599         sllx    %o4, 24, %o4
5600         or      %o4, %o3, %o3
5601         lduha   [%o0+5]%asi, %o4
5602         sllx    %o4, 8, %o4
5603         or      %o4, %o3, %o3
5604         lduba   [%o0+7]%asi, %o4
5605         or      %o4, %o3, %o4
5606         stx     %o4, [%o1]
5607         subcc   %o2, 8, %o2             ! decrement length count
5608         add     %o1, 8, %o1             ! increase dst ptr by 8
5609         add     %o0, 8, %o0             ! increase src ptr by 8
5610         bgu,pt  %ncc, .ci_medbh15
5611         stx     %o4, [%o1-8]
5612         ba      .ci_medb7
5613         nop
5614 
5615 /*
5616  * End of small copy in code (no window)
5617  * 
5618  */
5619 
5620 /*
5621  * Long copy in code (using register window and fp regs)
5622  * 
5623  */
5624 
5625 .ci_copy_more:
5626         sethi   %hi(copyio_fault), %o3
5627         or      %o3, %lo(copyio_fault), %o3
5628         membar  #Sync
5629         stn     %o3, [THREAD_REG + T_LOFAULT]
5630 /*
5631  * The following code is for large copies. We know there are at
5632  * least FP_COPY bytes available. FP regs are used, so
5633  * we save registers and fp regs before starting.
5634  */
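/*
 * FPU setup sketch (illustrative, C-like): the fp registers are
 * saved only if another context was already using them; otherwise
 * the FPU is simply enabled:
 *
 *	if (fprs & FPRS_FEF)
 *		save fp regs to stack;		! BST_FP_TOSTACK
 *	else
 *		fprs = FPRS_FEF;		! enable, nothing to save
 */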
5635         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
5636         or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
5637         rd      %fprs, %g1              ! check for unused fp
5638         ! if fprs.fef == 0, set it.
5639         ! Setting it when already set costs more than checking
5640         andcc   %g1, FPRS_FEF, %g1      ! test FEF, fprs.du = fprs.dl = 0
5641         bz,pt   %ncc, .ci_fp_unused
5642         mov     ASI_USER, %asi
5643         BST_FP_TOSTACK(%o3)
5644         ba      .ci_fp_ready
5645 .ci_fp_unused:
5646         prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read
5647         wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
5648 .ci_fp_ready:
5649         rd      %gsr, %l5               ! save %gsr value
5650         andcc   %i1, 1, %o3             ! is dest byte aligned
5651         bnz,pt  %ncc, .ci_big_d1
5652 .ci_big_d1f:                            ! dest is now half word aligned
5653         andcc   %i1, 2, %o3
5654         bnz,pt  %ncc, .ci_big_d2
5655 .ci_big_d2f:                            ! dest is now word aligned
5656         andcc   %i1, 4, %o3
5657         bnz,pt  %ncc, .ci_big_d4
5658 .ci_big_d4f:                            ! dest is long word aligned
5659         andcc   %i0, 7, %o3             ! is src long word aligned
5660         brnz,pt %o3, .ci_big_unal8
5661         prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read
5662         ! Src and dst are long word aligned
5663         ! align dst to 64 byte boundary
5664         andcc   %i1, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
5665         brz,pn  %o3, .ci_al_to_64
5666         nop
5667         sub     %o3, 64, %o3            ! %o3 has negative bytes to move
5668         add     %i2, %o3, %i2           ! adjust remaining count
5669         andcc   %o3, 8, %o4             ! odd long words to move?
5670         brz,pt  %o4, .ci_al_to_16
5671         nop
5672         add     %o3, 8, %o3
5673         ldxa    [%i0]%asi, %o4
5674         add     %i0, 8, %i0             ! increment src ptr
5675         add     %i1, 8, %i1             ! increment dst ptr
5676         stx     %o4, [%i1-8]
5677 ! Dest is aligned on 16 bytes, src 8 byte aligned
5678 .ci_al_to_16:
5679         andcc   %o3, 0x30, %o4          ! pair of long words to move?
5680         brz,pt  %o4, .ci_al_to_64
5681         nop
5682 .ci_al_mv_16:
5683         add     %o3, 16, %o3
5684         ldxa    [%i0]%asi, %o4
5685         stx     %o4, [%i1]
5686         add     %i0, 16, %i0            ! increment src ptr
5687         ldxa    [%i0-8]%asi, %o4
5688         stx     %o4, [%i1+8]
5689         andcc   %o3, 0x30, %o4
5690         brnz,pt %o4, .ci_al_mv_16
5691         add     %i1, 16, %i1            ! increment dst ptr
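/*
 * The ladder above walks dst up to a 64 byte boundary: one odd
 * doubleword first if needed, then 16 bytes per iteration. A sketch
 * of the intent in C (hypothetical helper, not this file's
 * interface; the asm tracks the byte count as a negative number):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	ci_align_dst64(const uint64_t **s, uint64_t **d, size_t *cnt)
 *	{
 *		size_t pad = (size_t)(-(uintptr_t)*d & 0x3f);
 *
 *		*cnt -= pad;			// adjust remaining count
 *		if (pad & 8)			// odd long word
 *			*(*d)++ = *(*s)++;
 *		for (pad &= 0x30; pad != 0; pad -= 16) {
 *			(*d)[0] = (*s)[0];	// pair of long words
 *			(*d)[1] = (*s)[1];
 *			*s += 2;
 *			*d += 2;
 *		}
 *	}
 */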
5692 ! Dest is aligned on 64 bytes, src 8 byte aligned
5693 .ci_al_to_64:
5694         ! Determine source alignment
5695         ! to correct 8 byte offset
5696         andcc   %i0, 32, %o3
5697         brnz,pn %o3, .ci_aln_1
5698         andcc   %i0, 16, %o3
5699         brnz,pn %o3, .ci_aln_01
5700         andcc   %i0, 8, %o3
5701         brz,pn  %o3, .ci_aln_000
5702         prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5703         ba      .ci_aln_001
5704         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5705 .ci_aln_01:
5706         brnz,pn %o3, .ci_aln_011
5707         prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5708         ba      .ci_aln_010
5709         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5710 .ci_aln_1:
5711         andcc   %i0, 16, %o3
5712         brnz,pn %o3, .ci_aln_11
5713         andcc   %i0, 8, %o3
5714         brnz,pn %o3, .ci_aln_101
5715         prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5716         ba      .ci_aln_100
5717         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5718 .ci_aln_11:
5719         brz,pn  %o3, .ci_aln_110
5720         prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read
5721 
5722 .ci_aln_111:
5723 ! Alignment off by 8 bytes
5724         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5725         ldda    [%i0]%asi, %d0
5726         add     %i0, 8, %i0
5727         sub     %i2, 8, %i2
5728         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5729         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5730         sub     %i1, %i0, %i1
5731 .ci_aln_111_loop:
5732         ldda    [%i0]ASI_BLK_AIUS,%d16          ! block load
5733         subcc   %o3, 64, %o3
5734         fmovd   %d16, %d2
5735         fmovd   %d18, %d4
5736         fmovd   %d20, %d6
5737         fmovd   %d22, %d8
5738         fmovd   %d24, %d10
5739         fmovd   %d26, %d12
5740         fmovd   %d28, %d14
5741         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5742         stda    %d0,[%i0+%i1]ASI_BLK_P
5743         add     %i0, 64, %i0
5744         fmovd   %d30, %d0
5745         bgt,pt  %ncc, .ci_aln_111_loop
5746         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5747         add     %i1, %i0, %i1
5748 
5749         std     %d0, [%i1]
5750         ba      .ci_remain_stuff
5751         add     %i1, 8, %i1
5752         ! END OF aln_111
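/*
 * Shape of the loop above in C: it is software pipelined, carrying
 * one doubleword (%d0) across iterations so stores lag loads by 8
 * bytes, and it keeps dst as (dst - src) so one add advances both
 * streams. Sketch only; plain array reads stand in for the block
 * load/store ASIs:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	ci_pipe_copy(const uint64_t *s, uint64_t *d, size_t blks)
 *	{
 *		uint64_t carry = *s++;		// ldda before the loop
 *
 *		while (blks-- != 0) {
 *			uint64_t in[8];		// ldda [%i0]ASI_BLK_AIUS
 *			for (int i = 0; i < 8; i++)
 *				in[i] = s[i];
 *			d[0] = carry;		// stda %d0..%d14
 *			for (int i = 1; i < 8; i++)
 *				d[i] = in[i - 1];
 *			carry = in[7];		// fmovd %d30, %d0
 *			s += 8;
 *			d += 8;
 *		}
 *		*d = carry;			// trailing std %d0
 *	}
 *
 * The other .ci_aln_* variants differ only in how many doublewords
 * are carried across iterations. The stxa %g0/ASI_STBI_P before each
 * block store initializes the destination line so it need not be
 * fetched from memory first.
 */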
5753 
5754 .ci_aln_110:
5755 ! Alignment off by 16 bytes
5756         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5757         ldda    [%i0]%asi, %d0
5758         ldda    [%i0+8]%asi, %d2
5759         add     %i0, 16, %i0
5760         sub     %i2, 16, %i2
5761         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5762         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5763         sub     %i1, %i0, %i1
5764 .ci_aln_110_loop:
5765         ldda    [%i0]ASI_BLK_AIUS,%d16          ! block load
5766         subcc   %o3, 64, %o3
5767         fmovd   %d16, %d4
5768         fmovd   %d18, %d6
5769         fmovd   %d20, %d8
5770         fmovd   %d22, %d10
5771         fmovd   %d24, %d12
5772         fmovd   %d26, %d14
5773         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5774         stda    %d0,[%i0+%i1]ASI_BLK_P
5775         add     %i0, 64, %i0
5776         fmovd   %d28, %d0
5777         fmovd   %d30, %d2
5778         bgt,pt  %ncc, .ci_aln_110_loop
5779         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5780         add     %i1, %i0, %i1
5781 
5782         std     %d0, [%i1]
5783         std     %d2, [%i1+8]
5784         ba      .ci_remain_stuff
5785         add     %i1, 16, %i1
5786         ! END OF aln_110
5787 
5788 .ci_aln_101:
5789 ! Alignment off by 24 bytes
5790         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5791         ldda    [%i0]%asi, %d0
5792         ldda    [%i0+8]%asi, %d2
5793         ldda    [%i0+16]%asi, %d4
5794         add     %i0, 24, %i0
5795         sub     %i2, 24, %i2
5796         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5797         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5798         sub     %i1, %i0, %i1
5799 .ci_aln_101_loop:
5800         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
5801         subcc   %o3, 64, %o3
5802         fmovd   %d16, %d6
5803         fmovd   %d18, %d8
5804         fmovd   %d20, %d10
5805         fmovd   %d22, %d12
5806         fmovd   %d24, %d14
5807         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5808         stda    %d0,[%i0+%i1]ASI_BLK_P
5809         add     %i0, 64, %i0
5810         fmovd   %d26, %d0
5811         fmovd   %d28, %d2
5812         fmovd   %d30, %d4
5813         bgt,pt  %ncc, .ci_aln_101_loop
5814         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5815         add     %i1, %i0, %i1
5816 
5817         std     %d0, [%i1]
5818         std     %d2, [%i1+8]
5819         std     %d4, [%i1+16]
5820         ba      .ci_remain_stuff
5821         add     %i1, 24, %i1
5822         ! END OF aln_101
5823 
5824 .ci_aln_100:
5825 ! Alignment off by 32 bytes
5826         ldda    [%i0]%asi, %d0
5827         ldda    [%i0+8]%asi, %d2
5828         ldda    [%i0+16]%asi,%d4
5829         ldda    [%i0+24]%asi,%d6
5830         add     %i0, 32, %i0
5831         sub     %i2, 32, %i2
5832         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5833         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5834         sub     %i1, %i0, %i1
5835 .ci_aln_100_loop:
5836         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
5837         subcc   %o3, 64, %o3
5838         fmovd   %d16, %d8
5839         fmovd   %d18, %d10
5840         fmovd   %d20, %d12
5841         fmovd   %d22, %d14
5842         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5843         stda    %d0,[%i0+%i1]ASI_BLK_P
5844         add     %i0, 64, %i0
5845         fmovd   %d24, %d0
5846         fmovd   %d26, %d2
5847         fmovd   %d28, %d4
5848         fmovd   %d30, %d6
5849         bgt,pt  %ncc, .ci_aln_100_loop
5850         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5851         add     %i1, %i0, %i1
5852 
5853         std     %d0, [%i1]
5854         std     %d2, [%i1+8]
5855         std     %d4, [%i1+16]
5856         std     %d6, [%i1+24]
5857         ba      .ci_remain_stuff
5858         add     %i1, 32, %i1
5859         ! END OF aln_100
5860 
5861 .ci_aln_011:
5862 ! Alignment off by 40 bytes
5863         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5864         ldda    [%i0]%asi, %d0
5865         ldda    [%i0+8]%asi, %d2
5866         ldda    [%i0+16]%asi, %d4
5867         ldda    [%i0+24]%asi, %d6
5868         ldda    [%i0+32]%asi, %d8
5869         add     %i0, 40, %i0
5870         sub     %i2, 40, %i2
5871         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5872         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5873         sub     %i1, %i0, %i1
5874 .ci_aln_011_loop:
5875         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
5876         subcc   %o3, 64, %o3
5877         fmovd   %d16, %d10
5878         fmovd   %d18, %d12
5879         fmovd   %d20, %d14
5880         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5881         stda    %d0,[%i0+%i1]ASI_BLK_P
5882         add     %i0, 64, %i0
5883         fmovd   %d22, %d0
5884         fmovd   %d24, %d2
5885         fmovd   %d26, %d4
5886         fmovd   %d28, %d6
5887         fmovd   %d30, %d8
5888         bgt,pt  %ncc, .ci_aln_011_loop
5889         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5890         add     %i1, %i0, %i1
5891 
5892         std     %d0, [%i1]
5893         std     %d2, [%i1+8]
5894         std     %d4, [%i1+16]
5895         std     %d6, [%i1+24]
5896         std     %d8, [%i1+32]
5897         ba      .ci_remain_stuff
5898         add     %i1, 40, %i1
5899         ! END OF aln_011
5900 
5901 .ci_aln_010:
5902 ! Alignment off by 48 bytes
5903         ldda    [%i0]%asi, %d0
5904         ldda    [%i0+8]%asi, %d2
5905         ldda    [%i0+16]%asi, %d4
5906         ldda    [%i0+24]%asi, %d6
5907         ldda    [%i0+32]%asi, %d8
5908         ldda    [%i0+40]%asi, %d10
5909         add     %i0, 48, %i0
5910         sub     %i2, 48, %i2
5911         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5912         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5913         sub     %i1, %i0, %i1
5914 .ci_aln_010_loop:
5915         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
5916         subcc   %o3, 64, %o3
5917         fmovd   %d16, %d12
5918         fmovd   %d18, %d14
5919         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5920         stda    %d0,[%i0+%i1]ASI_BLK_P
5921         add     %i0, 64, %i0
5922         fmovd   %d20, %d0
5923         fmovd   %d22, %d2
5924         fmovd   %d24, %d4
5925         fmovd   %d26, %d6
5926         fmovd   %d28, %d8
5927         fmovd   %d30, %d10
5928         bgt,pt  %ncc, .ci_aln_010_loop
5929         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5930         add     %i1, %i0, %i1
5931 
5932         std     %d0, [%i1]
5933         std     %d2, [%i1+8]
5934         std     %d4, [%i1+16]
5935         std     %d6, [%i1+24]
5936         std     %d8, [%i1+32]
5937         std     %d10, [%i1+40]
5938         ba      .ci_remain_stuff
5939         add     %i1, 48, %i1
5940         ! END OF aln_010
5941 
5942 .ci_aln_001:
5943 ! Alignment off by 56 bytes
5944         ldda    [%i0]%asi, %d0
5945         ldda    [%i0+8]%asi, %d2
5946         ldda    [%i0+16]%asi, %d4
5947         ldda    [%i0+24]%asi, %d6
5948         ldda    [%i0+32]%asi, %d8
5949         ldda    [%i0+40]%asi, %d10
5950         ldda    [%i0+48]%asi, %d12
5951         add     %i0, 56, %i0
5952         sub     %i2, 56, %i2
5953         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5954         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5955         sub     %i1, %i0, %i1
5956 .ci_aln_001_loop:
5957         ldda    [%i0]ASI_BLK_AIUS,%d16  ! block load
5958         subcc   %o3, 64, %o3
5959         fmovd   %d16, %d14
5960         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5961         stda    %d0,[%i0+%i1]ASI_BLK_P
5962         add     %i0, 64, %i0
5963         fmovd   %d18, %d0
5964         fmovd   %d20, %d2
5965         fmovd   %d22, %d4
5966         fmovd   %d24, %d6
5967         fmovd   %d26, %d8
5968         fmovd   %d28, %d10
5969         fmovd   %d30, %d12
5970         bgt,pt  %ncc, .ci_aln_001_loop
5971         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5972         add     %i1, %i0, %i1
5973 
5974         std     %d0, [%i1]
5975         std     %d2, [%i1+8]
5976         std     %d4, [%i1+16]
5977         std     %d6, [%i1+24]
5978         std     %d8, [%i1+32]
5979         std     %d10, [%i1+40]
5980         std     %d12, [%i1+48]
5981         ba      .ci_remain_stuff
5982         add     %i1, 56, %i1
5983         ! END OF aln_001
5984 
5985 .ci_aln_000:
5986         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5987         andn    %i2, 0x7f, %o3          ! %o3 is multiple of 2*block size
5988         and     %i2, 0x7f, %i2          ! residue bytes in %i2
5989         sub     %i1, %i0, %i1
5990 .ci_aln_000_loop:
5991         ldda    [%i0]ASI_BLK_AIUS,%d0
5992         subcc   %o3, 64, %o3
5993         stxa    %g0,[%i0+%i1]ASI_STBI_P ! block initializing store
5994         stda    %d0,[%i0+%i1]ASI_BLK_P
5995         add     %i0, 64, %i0
5996         bgt,pt  %ncc, .ci_aln_000_loop
5997         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
5998         add     %i1, %i0, %i1
5999 
6000         ! END OF aln_000
6001 
6002 .ci_remain_stuff:
6003         subcc   %i2, 31, %i2            ! adjust length to allow cc test
6004         ble,pt  %ncc, .ci_aln_31
6005         nop
6006 .ci_aln_32:
6007         ldxa    [%i0]%asi, %o4          ! move 32 bytes
6008         subcc   %i2, 32, %i2            ! decrement length count by 32
6009         stx     %o4, [%i1]
6010         ldxa    [%i0+8]%asi, %o4
6011         stx     %o4, [%i1+8]
6012         ldxa    [%i0+16]%asi, %o4
6013         add     %i0, 32, %i0            ! increase src ptr by 32
6014         stx     %o4, [%i1+16]
6015         ldxa    [%i0-8]%asi, %o4
6016         add     %i1, 32, %i1            ! increase dst ptr by 32
6017         bgu,pt  %ncc, .ci_aln_32        ! repeat if at least 32 bytes left
6018         stx     %o4, [%i1-8]
6019 .ci_aln_31:
6020         addcc   %i2, 24, %i2            ! adjust count to be off by 7
6021         ble,pt  %ncc, .ci_aln_7         ! skip if 7 or fewer bytes left
6022         nop                             !
6023 .ci_aln_15:
6024         ldxa    [%i0]%asi, %o4          ! move 8 bytes
6025         add     %i0, 8, %i0             ! increase src ptr by 8
6026         subcc   %i2, 8, %i2             ! decrease count by 8
6027         add     %i1, 8, %i1             ! increase dst ptr by 8
6028         bgu,pt  %ncc, .ci_aln_15
6029         stx     %o4, [%i1-8]            !
6030 .ci_aln_7:
6031         addcc   %i2, 7, %i2             ! finish adjustment of remaining count
6032         bz,pt   %ncc, .ci_exit          ! exit if finished
6033         cmp     %i2, 4
6034         blt,pt  %ncc, .ci_unaln3x       ! skip if less than 4 bytes left
6035         nop                             !
6036         lda     [%i0]%asi, %o4          ! move 4 bytes
6037         add     %i0, 4, %i0             ! increase src ptr by 4
6038         add     %i1, 4, %i1             ! increase dst ptr by 4
6039         subcc   %i2, 4, %i2             ! decrease count by 4
6040         bnz     .ci_unaln3x
6041         stw     %o4, [%i1-4]
6042         ba      .ci_exit
6043         nop
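/*
 * Residue handling above, as a C sketch (hypothetical helper; memcpy
 * stands in for the aligned ldxa/stx pairs): 32 bytes at a time,
 * then 8, then one word, then the last 1-3 bytes via .ci_unaln3x.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	ci_residue(const uint8_t *s, uint8_t *d, size_t n)
 *	{
 *		for (; n >= 32; n -= 32, s += 32, d += 32)
 *			memcpy(d, s, 32);
 *		for (; n >= 8; n -= 8, s += 8, d += 8)
 *			memcpy(d, s, 8);
 *		if (n >= 4) {
 *			memcpy(d, s, 4);
 *			s += 4; d += 4; n -= 4;
 *		}
 *		while (n-- != 0)
 *			*d++ = *s++;
 *	}
 */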
6044 
6045         ! destination alignment code
6046 .ci_big_d1:
6047         lduba   [%i0]%asi, %o4          ! move a byte
6048         add     %i0, 1, %i0
6049         stb     %o4, [%i1]
6050         add     %i1, 1, %i1
6051         andcc   %i1, 2, %o3
6052         bz,pt   %ncc, .ci_big_d2f
6053         sub     %i2, 1, %i2
6054 .ci_big_d2:                             ! dest is now at least half word aligned
6055         lduba   [%i0]%asi, %o4          ! move a half-word (src align unknown)
6056         lduba   [%i0+1]%asi, %o3
6057         add     %i0, 2, %i0
6058         sll     %o4, 8, %o4             ! position
6059         or      %o4, %o3, %o4           ! merge
6060         sth     %o4, [%i1]
6061         add     %i1, 2, %i1
6062         andcc   %i1, 4, %o3
6063         bz,pt   %ncc, .ci_big_d4f
6064         sub     %i2, 2, %i2
6065 .ci_big_d4:                             ! dest is at least word aligned
6066         nop
6067         lduba   [%i0]%asi, %o4          ! move a word (src align unknown)
6068         lduba   [%i0+1]%asi, %o3
6069         sll     %o4, 24, %o4            ! position
6070         sll     %o3, 16, %o3            ! position
6071         or      %o4, %o3, %o3           ! merge
6072         lduba   [%i0+2]%asi, %o4
6073         sll     %o4, 8, %o4             ! position
6074         or      %o4, %o3, %o3           ! merge
6075         lduba   [%i0+3]%asi, %o4
6076         or      %o4, %o3, %o4           ! merge
6077         stw     %o4,[%i1]               ! store four bytes
6078         add     %i0, 4, %i0             ! adjust src by 4
6079         add     %i1, 4, %i1             ! adjust dest by 4
6080         ba      .ci_big_d4f
6081         sub     %i2, 4, %i2             ! adjust count by 4
6082 
6083 
        ! Dst is on an 8 byte boundary; src is not
6085 .ci_big_unal8:
6086         andcc   %i1, 0x3f, %o3          ! is dst 64-byte block aligned?
6087         bz      %ncc, .ci_unalnsrc
6088         sub     %o3, 64, %o3            ! %o3 will be multiple of 8
6089         neg     %o3                     ! bytes until dest is 64 byte aligned
6090         sub     %i2, %o3, %i2           ! update cnt with bytes to be moved
6091         ! Move bytes according to source alignment
6092         andcc   %i0, 0x1, %o4
6093         bnz     %ncc, .ci_unalnbyte     ! check for byte alignment
6094         nop
6095         andcc   %i0, 2, %o4             ! check for half word alignment
6096         bnz     %ncc, .ci_unalnhalf
6097         nop
6098         ! Src is word aligned, move bytes until dest 64 byte aligned
6099 .ci_unalnword:
6100         lda     [%i0]%asi, %o4          ! load 4 bytes
6101         stw     %o4, [%i1]              ! and store 4 bytes
6102         lda     [%i0+4]%asi, %o4        ! load 4 bytes
6103         add     %i0, 8, %i0             ! increase src ptr by 8
6104         stw     %o4, [%i1+4]            ! and store 4 bytes
6105         subcc   %o3, 8, %o3             ! decrease count by 8
6106         bnz     %ncc, .ci_unalnword
6107         add     %i1, 8, %i1             ! increase dst ptr by 8
6108         ba      .ci_unalnsrc
6109         nop
6110 
6111         ! Src is half-word aligned, move bytes until dest 64 byte aligned
6112 .ci_unalnhalf:
6113         lduha   [%i0]%asi, %o4          ! load 2 bytes
6114         sllx    %o4, 32, %i3            ! shift left
6115         lduwa   [%i0+2]%asi, %o4
6116         or      %o4, %i3, %i3
6117         sllx    %i3, 16, %i3
6118         lduha   [%i0+6]%asi, %o4
6119         or      %o4, %i3, %i3
6120         stx     %i3, [%i1]
6121         add     %i0, 8, %i0
6122         subcc   %o3, 8, %o3
6123         bnz     %ncc, .ci_unalnhalf
6124         add     %i1, 8, %i1
6125         ba      .ci_unalnsrc
6126         nop
6127 
6128         ! Src is Byte aligned, move bytes until dest 64 byte aligned
6129 .ci_unalnbyte:
6130         sub     %i1, %i0, %i1           ! share pointer advance
6131 .ci_unalnbyte_loop:
6132         lduba   [%i0]%asi, %o4
6133         sllx    %o4, 56, %i3
6134         lduha   [%i0+1]%asi, %o4
6135         sllx    %o4, 40, %o4
6136         or      %o4, %i3, %i3
6137         lduha   [%i0+3]%asi, %o4
6138         sllx    %o4, 24, %o4
6139         or      %o4, %i3, %i3
6140         lduha   [%i0+5]%asi, %o4
6141         sllx    %o4, 8, %o4
6142         or      %o4, %i3, %i3
6143         lduba   [%i0+7]%asi, %o4
6144         or      %o4, %i3, %i3
6145         stx     %i3, [%i1+%i0]
6146         subcc   %o3, 8, %o3
6147         bnz     %ncc, .ci_unalnbyte_loop
6148         add     %i0, 8, %i0
6149         add     %i1,%i0, %i1            ! restore pointer
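/*
 * Note the trick above: dst is rewritten as (dst - src) so that the
 * single "add %i0, 8, %i0" advances both streams, and the store goes
 * through [%i1 + %i0]. In C (sketch, byte granularity for brevity):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	ci_offset_copy(const uint8_t *s, uint8_t *d, size_t n)
 *	{
 *		ptrdiff_t off = d - s;		// sub %i1, %i0, %i1
 *
 *		while (n-- != 0) {
 *			*(uint8_t *)(s + off) = *s;	// stx [%i1+%i0]
 *			s++;			// one bump moves both
 *		}
 *		// d = (uint8_t *)(s + off);	// add %i1, %i0, %i1
 *	}
 */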
6150 
        ! Destination is now block (64 byte) aligned; src is not 8 byte aligned
6152 .ci_unalnsrc:
6153         andn    %i2, 0x3f, %i3          ! %i3 is multiple of block size
6154         and     %i2, 0x3f, %i2          ! residue bytes in %i2
        add     %i2, 64, %i2            ! Ensure we don't load beyond
        sub     %i3, 64, %i3            ! the end of the source buffer
6157 
6158         andn    %i0, 0x3f, %o4          ! %o4 has block aligned src address
6159         prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
6160         alignaddr %i0, %g0, %g0         ! generate %gsr
6161         add     %i0, %i3, %i0           ! advance %i0 to after blocks
6162         !
6163         ! Determine source alignment to correct 8 byte offset
6164         andcc   %i0, 0x20, %o3
6165         brnz,pn %o3, .ci_unaln_1
6166         andcc   %i0, 0x10, %o3
6167         brnz,pn %o3, .ci_unaln_01
6168         andcc   %i0, 0x08, %o3
6169         brz,a   %o3, .ci_unaln_000
6170         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6171         ba      .ci_unaln_001
6172         nop
6173 .ci_unaln_01:
6174         brnz,a  %o3, .ci_unaln_011
6175         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6176         ba      .ci_unaln_010
6177         nop
6178 .ci_unaln_1:
6179         brnz,pn %o3, .ci_unaln_11
6180         andcc   %i0, 0x08, %o3
6181         brnz,a  %o3, .ci_unaln_101
6182         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6183         ba      .ci_unaln_100
6184         nop
6185 .ci_unaln_11:
6186         brz,pn  %o3, .ci_unaln_110
6187         prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read
6188 
6189 .ci_unaln_111:
6190         ldda    [%o4+56]%asi, %d14
6191 .ci_unaln_111_loop:
6192         add     %o4, 64, %o4
6193         ldda    [%o4]ASI_BLK_AIUS, %d16
6194         faligndata %d14, %d16, %d48
6195         faligndata %d16, %d18, %d50
6196         faligndata %d18, %d20, %d52
6197         faligndata %d20, %d22, %d54
6198         faligndata %d22, %d24, %d56
6199         faligndata %d24, %d26, %d58
6200         faligndata %d26, %d28, %d60
6201         faligndata %d28, %d30, %d62
6202         fmovd   %d30, %d14
6203         stda    %d48, [%i1]ASI_BLK_P
6204         subcc   %i3, 64, %i3
6205         add     %i1, 64, %i1
6206         bgu,pt  %ncc, .ci_unaln_111_loop
6207         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6208         ba      .ci_unaln_done
6209         nop
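/*
 * The unaligned loops lean on alignaddr/faligndata: alignaddr stored
 * the low three bits of the original src address in %gsr, and each
 * faligndata extracts 8 destination bytes from the 16-byte window
 * formed by two aligned doublewords. In C (big-endian sketch; off is
 * 1..7 here -- the aligned path already handled off == 0):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	falign(uint64_t prev, uint64_t next, unsigned off)  // src & 7
 *	{
 *		return ((prev << (8 * off)) | (next >> (64 - 8 * off)));
 *	}
 *
 * Each .ci_unaln_* variant pre-loads just enough trailing doublewords
 * of the first aligned block (one for _111, seven for _001) to seed
 * the first faligndata of the loop.
 */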
6210 
6211 .ci_unaln_110:
6212         ldda    [%o4+48]%asi, %d12
6213         ldda    [%o4+56]%asi, %d14
6214 .ci_unaln_110_loop:
6215         add     %o4, 64, %o4
6216         ldda    [%o4]ASI_BLK_AIUS, %d16
6217         faligndata %d12, %d14, %d48
6218         faligndata %d14, %d16, %d50
6219         faligndata %d16, %d18, %d52
6220         faligndata %d18, %d20, %d54
6221         faligndata %d20, %d22, %d56
6222         faligndata %d22, %d24, %d58
6223         faligndata %d24, %d26, %d60
6224         faligndata %d26, %d28, %d62
6225         fmovd   %d28, %d12
6226         fmovd   %d30, %d14
6227         stda    %d48, [%i1]ASI_BLK_P
6228         subcc   %i3, 64, %i3
6229         add     %i1, 64, %i1
6230         bgu,pt  %ncc, .ci_unaln_110_loop
6231         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6232         ba      .ci_unaln_done
6233         nop
6234 
6235 .ci_unaln_101:
6236         ldda    [%o4+40]%asi, %d10
6237         ldda    [%o4+48]%asi, %d12
6238         ldda    [%o4+56]%asi, %d14
6239 .ci_unaln_101_loop:
6240         add     %o4, 64, %o4
6241         ldda    [%o4]ASI_BLK_AIUS, %d16
6242         faligndata %d10, %d12, %d48
6243         faligndata %d12, %d14, %d50
6244         faligndata %d14, %d16, %d52
6245         faligndata %d16, %d18, %d54
6246         faligndata %d18, %d20, %d56
6247         faligndata %d20, %d22, %d58
6248         faligndata %d22, %d24, %d60
6249         faligndata %d24, %d26, %d62
6250         fmovd   %d26, %d10
6251         fmovd   %d28, %d12
6252         fmovd   %d30, %d14
6253         stda    %d48, [%i1]ASI_BLK_P
6254         subcc   %i3, 64, %i3
6255         add     %i1, 64, %i1
6256         bgu,pt  %ncc, .ci_unaln_101_loop
6257         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6258         ba      .ci_unaln_done
6259         nop
6260 
6261 .ci_unaln_100:
6262         ldda    [%o4+32]%asi, %d8
6263         ldda    [%o4+40]%asi, %d10
6264         ldda    [%o4+48]%asi, %d12
6265         ldda    [%o4+56]%asi, %d14
6266 .ci_unaln_100_loop:
6267         add     %o4, 64, %o4
6268         ldda    [%o4]ASI_BLK_AIUS, %d16
6269         faligndata %d8, %d10, %d48
6270         faligndata %d10, %d12, %d50
6271         faligndata %d12, %d14, %d52
6272         faligndata %d14, %d16, %d54
6273         faligndata %d16, %d18, %d56
6274         faligndata %d18, %d20, %d58
6275         faligndata %d20, %d22, %d60
6276         faligndata %d22, %d24, %d62
6277         fmovd   %d24, %d8
6278         fmovd   %d26, %d10
6279         fmovd   %d28, %d12
6280         fmovd   %d30, %d14
6281         stda    %d48, [%i1]ASI_BLK_P
6282         subcc   %i3, 64, %i3
6283         add     %i1, 64, %i1
6284         bgu,pt  %ncc, .ci_unaln_100_loop
6285         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6286         ba      .ci_unaln_done
6287         nop
6288 
6289 .ci_unaln_011:
6290         ldda    [%o4+24]%asi, %d6
6291         ldda    [%o4+32]%asi, %d8
6292         ldda    [%o4+40]%asi, %d10
6293         ldda    [%o4+48]%asi, %d12
6294         ldda    [%o4+56]%asi, %d14
6295 .ci_unaln_011_loop:
6296         add     %o4, 64, %o4
6297         ldda    [%o4]ASI_BLK_AIUS, %d16
6298         faligndata %d6, %d8, %d48
6299         faligndata %d8, %d10, %d50
6300         faligndata %d10, %d12, %d52
6301         faligndata %d12, %d14, %d54
6302         faligndata %d14, %d16, %d56
6303         faligndata %d16, %d18, %d58
6304         faligndata %d18, %d20, %d60
6305         faligndata %d20, %d22, %d62
6306         fmovd   %d22, %d6
6307         fmovd   %d24, %d8
6308         fmovd   %d26, %d10
6309         fmovd   %d28, %d12
6310         fmovd   %d30, %d14
6311         stda    %d48, [%i1]ASI_BLK_P
6312         subcc   %i3, 64, %i3
6313         add     %i1, 64, %i1
6314         bgu,pt  %ncc, .ci_unaln_011_loop
6315         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6316         ba      .ci_unaln_done
6317         nop
6318 
6319 .ci_unaln_010:
6320         ldda    [%o4+16]%asi, %d4
6321         ldda    [%o4+24]%asi, %d6
6322         ldda    [%o4+32]%asi, %d8
6323         ldda    [%o4+40]%asi, %d10
6324         ldda    [%o4+48]%asi, %d12
6325         ldda    [%o4+56]%asi, %d14
6326 .ci_unaln_010_loop:
6327         add     %o4, 64, %o4
6328         ldda    [%o4]ASI_BLK_AIUS, %d16
6329         faligndata %d4, %d6, %d48
6330         faligndata %d6, %d8, %d50
6331         faligndata %d8, %d10, %d52
6332         faligndata %d10, %d12, %d54
6333         faligndata %d12, %d14, %d56
6334         faligndata %d14, %d16, %d58
6335         faligndata %d16, %d18, %d60
6336         faligndata %d18, %d20, %d62
6337         fmovd   %d20, %d4
6338         fmovd   %d22, %d6
6339         fmovd   %d24, %d8
6340         fmovd   %d26, %d10
6341         fmovd   %d28, %d12
6342         fmovd   %d30, %d14
6343         stda    %d48, [%i1]ASI_BLK_P
6344         subcc   %i3, 64, %i3
6345         add     %i1, 64, %i1
6346         bgu,pt  %ncc, .ci_unaln_010_loop
6347         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6348         ba      .ci_unaln_done
6349         nop
6350 
6351 .ci_unaln_001:
6352         ldda    [%o4+8]%asi, %d2
6353         ldda    [%o4+16]%asi, %d4
6354         ldda    [%o4+24]%asi, %d6
6355         ldda    [%o4+32]%asi, %d8
6356         ldda    [%o4+40]%asi, %d10
6357         ldda    [%o4+48]%asi, %d12
6358         ldda    [%o4+56]%asi, %d14
6359 .ci_unaln_001_loop:
6360         add     %o4, 64, %o4
6361         ldda    [%o4]ASI_BLK_AIUS, %d16
6362         faligndata %d2, %d4, %d48
6363         faligndata %d4, %d6, %d50
6364         faligndata %d6, %d8, %d52
6365         faligndata %d8, %d10, %d54
6366         faligndata %d10, %d12, %d56
6367         faligndata %d12, %d14, %d58
6368         faligndata %d14, %d16, %d60
6369         faligndata %d16, %d18, %d62
6370         fmovd   %d18, %d2
6371         fmovd   %d20, %d4
6372         fmovd   %d22, %d6
6373         fmovd   %d24, %d8
6374         fmovd   %d26, %d10
6375         fmovd   %d28, %d12
6376         fmovd   %d30, %d14
6377         stda    %d48, [%i1]ASI_BLK_P
6378         subcc   %i3, 64, %i3
6379         add     %i1, 64, %i1
6380         bgu,pt  %ncc, .ci_unaln_001_loop
6381         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6382         ba      .ci_unaln_done
6383         nop
6384 
6385 .ci_unaln_000:
6386         ldda    [%o4]ASI_BLK_AIUS, %d0
6387 .ci_unaln_000_loop:
6388         add     %o4, 64, %o4
6389         ldda    [%o4]ASI_BLK_AIUS, %d16
6390         faligndata %d0, %d2, %d48
6391         faligndata %d2, %d4, %d50
6392         faligndata %d4, %d6, %d52
6393         faligndata %d6, %d8, %d54
6394         faligndata %d8, %d10, %d56
6395         faligndata %d10, %d12, %d58
6396         faligndata %d12, %d14, %d60
6397         faligndata %d14, %d16, %d62
6398         fmovd   %d16, %d0
6399         fmovd   %d18, %d2
6400         fmovd   %d20, %d4
6401         fmovd   %d22, %d6
6402         fmovd   %d24, %d8
6403         fmovd   %d26, %d10
6404         fmovd   %d28, %d12
6405         fmovd   %d30, %d14
6406         stda    %d48, [%i1]ASI_BLK_P
6407         subcc   %i3, 64, %i3
6408         add     %i1, 64, %i1
6409         bgu,pt  %ncc, .ci_unaln_000_loop
6410         prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
6411 
6412 .ci_unaln_done:
6413         ! Handle trailing bytes, 64 to 127
6414         ! Dest long word aligned, Src not long word aligned
6415         cmp     %i2, 15
6416         bleu    %ncc, .ci_unaln_short
6417 
6418         andn    %i2, 0x7, %i3           ! %i3 is multiple of 8
6419         and     %i2, 0x7, %i2           ! residue bytes in %i2
6420         add     %i2, 8, %i2
        sub     %i3, 8, %i3             ! ensure we don't load past end of src
6422         andn    %i0, 0x7, %o4           ! %o4 has long word aligned src address
6423         add     %i0, %i3, %i0           ! advance %i0 to after multiple of 8
6424         ldda    [%o4]%asi, %d0          ! fetch partial word
6425 .ci_unaln_by8:
6426         ldda    [%o4+8]%asi, %d2
6427         add     %o4, 8, %o4
6428         faligndata %d0, %d2, %d16
6429         subcc   %i3, 8, %i3
6430         std     %d16, [%i1]
6431         fmovd   %d2, %d0
6432         bgu,pt  %ncc, .ci_unaln_by8
6433         add     %i1, 8, %i1
6434 
6435 .ci_unaln_short:
6436         cmp     %i2, 8
6437         blt,pt  %ncc, .ci_unalnfin
6438         nop
6439         lduba   [%i0]%asi, %o4
6440         sll     %o4, 24, %o3
6441         lduba   [%i0+1]%asi, %o4
6442         sll     %o4, 16, %o4
6443         or      %o4, %o3, %o3
6444         lduba   [%i0+2]%asi, %o4
6445         sll     %o4, 8, %o4
6446         or      %o4, %o3, %o3
6447         lduba   [%i0+3]%asi, %o4
6448         or      %o4, %o3, %o3
6449         stw     %o3, [%i1]
6450         lduba   [%i0+4]%asi, %o4
6451         sll     %o4, 24, %o3
6452         lduba   [%i0+5]%asi, %o4
6453         sll     %o4, 16, %o4
6454         or      %o4, %o3, %o3
6455         lduba   [%i0+6]%asi, %o4
6456         sll     %o4, 8, %o4
6457         or      %o4, %o3, %o3
6458         lduba   [%i0+7]%asi, %o4
6459         or      %o4, %o3, %o3
6460         stw     %o3, [%i1+4]
6461         add     %i0, 8, %i0
6462         add     %i1, 8, %i1
6463         sub     %i2, 8, %i2
6464 .ci_unalnfin:
6465         cmp     %i2, 4
6466         blt,pt  %ncc, .ci_unalnz
6467         tst     %i2
6468         lduba   [%i0]%asi, %o3          ! read byte
6469         subcc   %i2, 4, %i2             ! reduce count by 4
6470         sll     %o3, 24, %o3            ! position
6471         lduba   [%i0+1]%asi, %o4
6472         sll     %o4, 16, %o4            ! position
6473         or      %o4, %o3, %o3           ! merge
6474         lduba   [%i0+2]%asi, %o4
6475         sll     %o4, 8, %o4             ! position
6476         or      %o4, %o3, %o3           ! merge
6477         add     %i1, 4, %i1             ! advance dst by 4
6478         lduba   [%i0+3]%asi, %o4
6479         add     %i0, 4, %i0             ! advance src by 4
6480         or      %o4, %o3, %o4           ! merge
6481         bnz,pt  %ncc, .ci_unaln3x
6482         stw     %o4, [%i1-4]
6483         ba      .ci_exit
6484         nop
6485 .ci_unalnz:
6486         bz,pt   %ncc, .ci_exit
6487         wr      %l5, %g0, %gsr          ! restore %gsr
6488 .ci_unaln3x:                            ! Exactly 1, 2, or 3 bytes remain
6489         subcc   %i2, 1, %i2             ! reduce count for cc test
6490         lduba   [%i0]%asi, %o4          ! load one byte
6491         bz,pt   %ncc, .ci_exit
6492         stb     %o4, [%i1]              ! store one byte
6493         lduba   [%i0+1]%asi, %o4        ! load second byte
6494         subcc   %i2, 1, %i2
6495         bz,pt   %ncc, .ci_exit
6496         stb     %o4, [%i1+1]            ! store second byte
6497         lduba   [%i0+2]%asi, %o4        ! load third byte
6498         stb     %o4, [%i1+2]            ! store third byte
6499 .ci_exit:
6500         brnz    %g1, .ci_fp_restore
6501         nop
6502         FZERO
6503         wr      %g1, %g0, %fprs
6504         ba,pt   %ncc, .ci_ex2
6505         membar  #Sync
6506 .ci_fp_restore:
6507         BLD_FP_FROMSTACK(%o4)
6508 .ci_ex2:
6509         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
6510         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6511         ret
6512         restore %g0, 0, %o0
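/*
 * Error-handling protocol used by the long copyin path, in C
 * (sketch; the struct is reduced to the one field this code touches,
 * and the flag value is illustrative):
 *
 *	#include <stdint.h>
 *
 *	#define	FPUSED_FLAG_C	1
 *	struct thread_sketch { uintptr_t t_lofault; };
 *
 *	static void
 *	ci_lofault_protocol(struct thread_sketch *t, uintptr_t handler)
 *	{
 *		uintptr_t saved = t->t_lofault | FPUSED_FLAG_C;
 *
 *		t->t_lofault = handler;	// after membar #Sync in asm
 *		// ... copy runs; a fault vectors to 'handler' ...
 *		t->t_lofault = saved & ~FPUSED_FLAG_C;	// restore
 *	}
 *
 * The flag tells the fault handler whether fp state must be restored
 * before unwinding.
 */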
6513 
6514 .copyin_err:
6515         ldn     [THREAD_REG + T_COPYOPS], %o4
6516         brz     %o4, 2f
6517         nop
6518         ldn     [%o4 + CP_COPYIN], %g2
6519         jmp     %g2
6520         nop
6521 2:
6522         retl
6523         mov     -1, %o0
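/*
 * The fault path above defers to per-thread copyops when they are
 * installed. Schematic C (the t_copyops/cp_copyin names match the
 * kernel's; the reduced types here are illustrative):
 *
 *	#include <stddef.h>
 *
 *	typedef int (*copyin_fn)(const void *, void *, size_t);
 *	struct copyops_sk { copyin_fn cp_copyin; };
 *	struct thread_sk { struct copyops_sk *t_copyops; };
 *
 *	static int
 *	ci_err(struct thread_sk *t, const void *u, void *k, size_t n)
 *	{
 *		if (t->t_copyops != NULL)	// brz %o4, 2f
 *			return (t->t_copyops->cp_copyin(u, k, n));
 *		return (-1);			// retl; mov -1, %o0
 *	}
 */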
6524 
6525 #else   /* NIAGARA_IMPL */
6526 .do_copyin:
6527         !
6528         ! Check the length and bail if zero.
6529         !
6530         tst     %o2
6531         bnz,pt  %ncc, 1f
6532         nop
6533         retl
6534         clr     %o0
6535 1:
6536         sethi   %hi(copyio_fault), %o4
6537         or      %o4, %lo(copyio_fault), %o4
6538         sethi   %hi(copyio_fault_nowindow), %o3
6539         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
6540         or      %o3, %lo(copyio_fault_nowindow), %o3
6541         membar  #Sync
6542         stn     %o3, [THREAD_REG + T_LOFAULT]
6543 
6544         mov     %o0, SAVE_SRC
6545         mov     %o1, SAVE_DST
6546         mov     %o2, SAVE_COUNT
6547 
6548         !
6549         ! Check to see if we're more than SMALL_LIMIT.
6550         !
6551         subcc   %o2, SMALL_LIMIT, %o3
6552         bgu,a,pt %ncc, .dci_ns
6553         or      %o0, %o1, %o3
6554         !
6555         ! What was previously ".small_copyin"
6556         !
6557 .dcibcp:
6558         sub     %g0, %o2, %o3           ! setup for copy loop
6559         add     %o0, %o2, %o0
6560         add     %o1, %o2, %o1
6561         ba,pt   %ncc, .dcicl
6562         lduba   [%o0 + %o3]ASI_USER, %o4
6563         !
6564         ! %o0 and %o1 point at the end and remain pointing at the end
6565         ! of their buffers. We pull things out by adding %o3 (which is
6566         ! the negation of the length) to the buffer end which gives us
        ! the current location in the buffers. By incrementing %o3 we walk
6568         ! through both buffers without having to bump each buffer's
6569         ! pointer. A very fast 4 instruction loop.
6570         !
6571         .align 16
6572 .dcicl:
6573         stb     %o4, [%o1 + %o3]
6574         inccc   %o3
6575         bl,a,pt %ncc, .dcicl
6576         lduba   [%o0 + %o3]ASI_USER, %o4
6577         !
6578         ! We're done. Go home.
6579         !       
6580         membar  #Sync
6581         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
6582         retl
6583         clr     %o0
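/*
 * The 4-instruction loop above in C (sketch): both pointers park at
 * the ends of their buffers and one negative index, counted up to
 * zero, walks the two buffers together.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	ci_neg_index_copy(const uint8_t *src, uint8_t *dst, size_t n)
 *	{
 *		const uint8_t *se = src + n;	// add %o0, %o2, %o0
 *		uint8_t *de = dst + n;		// add %o1, %o2, %o1
 *		long i = -(long)n;		// sub %g0, %o2, %o3
 *
 *		while (i != 0) {
 *			de[i] = se[i];		// lduba/stb [end + i]
 *			i++;			// inccc; bl,a,pt
 *		}
 *	}
 */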
6584         !
6585         ! Try aligned copies from here.
6586         !
6587 .dci_ns:
6588         !
6589         ! See if we're single byte aligned. If we are, check the
6590         ! limit for single byte copies. If we're smaller, or equal,
6591         ! bounce to the byte for byte copy loop. Otherwise do it in
6592         ! HW (if enabled).
6593         !
6594         btst    1, %o3
6595         bz,a,pt %icc, .dcih8
6596         btst    7, %o3
6597         !
6598         ! We're single byte aligned.
6599         !
6600         sethi   %hi(hw_copy_limit_1), %o3
6601         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
6602         !
6603         ! Is HW copy on? If not do everything byte for byte.
6604         !
6605         tst     %o3
6606         bz,pn   %icc, .dcibcp
6607         subcc   %o3, %o2, %o3
6608         !
6609         ! Are we bigger than the HW limit? If not
6610         ! go to byte for byte.
6611         !
6612         bge,pt  %ncc, .dcibcp
6613         nop
6614         !
6615         ! We're big enough and copy is on. Do it with HW.
6616         !
6617         ba,pt   %ncc, .big_copyin
6618         nop
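/*
 * Each alignment class repeats this same dispatch against its own
 * tunable. In C (sketch; hw_copy_limit_1 is the real tunable defined
 * at the end of this file, the helper is hypothetical):
 *
 *	#include <stddef.h>
 *
 *	extern int hw_copy_limit_1;
 *
 *	static int
 *	ci_use_hw_1(size_t len)
 *	{
 *		if (hw_copy_limit_1 == 0)	// tst; bz -> byte loop
 *			return (0);		// HW copy disabled
 *		// limit - len < 0 means the copy is big enough
 *		return (len > (size_t)hw_copy_limit_1);
 *	}
 */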
6619 .dcih8:
6620         !
6621         ! 8 byte aligned?
6622         !
6623         bnz,a   %ncc, .dcih4
6624         btst    3, %o3
6625         !
6626         ! We're eight byte aligned.
6627         !
6628         sethi   %hi(hw_copy_limit_8), %o3
6629         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
6630         !
6631         ! Is HW assist on? If not, do it with the aligned copy.
6632         !
6633         tst     %o3
6634         bz,pn   %icc, .dcis8
6635         subcc   %o3, %o2, %o3
6636         bge     %ncc, .dcis8
6637         nop
6638         ba,pt   %ncc, .big_copyin
6639         nop
6640 .dcis8:
6641         !
6642         ! Housekeeping for copy loops. Uses same idea as in the byte for
6643         ! byte copy loop above.
6644         !
6645         add     %o0, %o2, %o0
6646         add     %o1, %o2, %o1
6647         sub     %g0, %o2, %o3
6648         ba,pt   %ncc, .didebc
6649         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
6650         !
6651         ! 4 byte aligned?
6652         !
6653 .dcih4:
6654         bnz     %ncc, .dcih2
6655         sethi   %hi(hw_copy_limit_4), %o3
6656         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
6657         !
6658         ! Is HW assist on? If not, do it with the aligned copy.
6659         !
6660         tst     %o3
6661         bz,pn   %icc, .dcis4
6662         subcc   %o3, %o2, %o3
6663         !
6664         ! We're negative if our size is less than or equal to hw_copy_limit_4.
6665         !
6666         bge     %ncc, .dcis4
6667         nop
6668         ba,pt   %ncc, .big_copyin
6669         nop
6670 .dcis4:
6671         !
6672         ! Housekeeping for copy loops. Uses same idea as in the byte
6673         ! for byte copy loop above.
6674         !
6675         add     %o0, %o2, %o0
6676         add     %o1, %o2, %o1
6677         sub     %g0, %o2, %o3
6678         ba,pt   %ncc, .didfbc
6679         srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
6680 .dcih2:
6681         !
6682         ! We're two byte aligned. Check for "smallness"
6683         ! done in delay at .dcih4
6684         !
6685         bleu,pt %ncc, .dcis2
6686         sethi   %hi(hw_copy_limit_2), %o3
6687         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
6688         !
6689         ! Is HW assist on? If not, do it with the aligned copy.
6690         !
6691         tst     %o3
6692         bz,pn   %icc, .dcis2
6693         subcc   %o3, %o2, %o3
6694         !
6695         ! Are we larger than the HW limit?
6696         !
6697         bge     %ncc, .dcis2
6698         nop
6699         !
6700         ! HW assist is on and we're large enough to use it.
6701         !
6702         ba,pt   %ncc, .big_copyin
6703         nop
6704         !
6705         ! Housekeeping for copy loops. Uses same idea as in the byte
6706         ! for byte copy loop above.
6707         !
6708 .dcis2:
6709         add     %o0, %o2, %o0
6710         add     %o1, %o2, %o1
6711         sub     %g0, %o2, %o3
6712         ba,pt   %ncc, .didtbc
6713         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
6714         !
6715 .small_copyin:
6716         !
6717         ! Why are we doing this AGAIN? There are certain conditions in
        ! big copyin that will cause us to forgo the HW assisted copies
6719         ! and bounce back to a non-hw assisted copy. This dispatches
6720         ! those copies. Note that we branch around this in the main line
6721         ! code.
6722         !
6723         ! We make no check for limits or HW enablement here. We've
6724         ! already been told that we're a poster child so just go off
6725         ! and do it.
6726         !
6727         or      %o0, %o1, %o3
6728         btst    1, %o3
6729         bnz     %icc, .dcibcp           ! Most likely
6730         btst    7, %o3
6731         bz      %icc, .dcis8
6732         btst    3, %o3
6733         bz      %icc, .dcis4
6734         nop
6735         ba,pt   %ncc, .dcis2
6736         nop
6737         !
6738         ! Eight byte aligned copies. A steal from the original .small_copyin
6739         ! with modifications. %o2 is number of 8 byte chunks to copy. When
6740         ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
6741         ! to copy.
6742         !
6743         .align 32
6744 .didebc:
6745         ldxa    [%o0 + %o3]ASI_USER, %o4
6746         deccc   %o2
6747         stx     %o4, [%o1 + %o3]
6748         bg,pt   %ncc, .didebc
6749         addcc   %o3, 8, %o3
6750         !
6751         ! End of copy loop. Most 8 byte aligned copies end here.
6752         !
6753         bz,pt   %ncc, .dcifh
6754         nop
6755         !
6756         ! Something is left. Do it byte for byte.
6757         !
6758         ba,pt   %ncc, .dcicl
6759         lduba   [%o0 + %o3]ASI_USER, %o4
6760         !
6761         ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
6762         !
6763         .align 32
6764 .didfbc:
6765         lduwa   [%o0 + %o3]ASI_USER, %o4
6766         deccc   %o2
6767         st      %o4, [%o1 + %o3]
6768         bg,pt   %ncc, .didfbc
6769         addcc   %o3, 4, %o3
6770         !
6771         ! End of copy loop. Most 4 byte aligned copies end here.
6772         !
6773         bz,pt   %ncc, .dcifh
6774         nop
6775         !
6776         ! Something is left. Do it byte for byte.
6777         !
6778         ba,pt   %ncc, .dcicl
6779         lduba   [%o0 + %o3]ASI_USER, %o4
6780         !
6781         ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
6782         ! copy.
6783         !
6784         .align 32
6785 .didtbc:
6786         lduha   [%o0 + %o3]ASI_USER, %o4
6787         deccc   %o2
6788         sth     %o4, [%o1 + %o3]
6789         bg,pt   %ncc, .didtbc
6790         addcc   %o3, 2, %o3
6791         !
6792         ! End of copy loop. Most 2 byte aligned copies end here.
6793         !
6794         bz,pt   %ncc, .dcifh
6795         nop
6796         !
6797         ! Deal with the last byte
6798         !
6799         lduba   [%o0 + %o3]ASI_USER, %o4
6800         stb     %o4, [%o1 + %o3]
6801 .dcifh:
6802         membar  #Sync
6803         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
6804         retl
6805         clr     %o0
6806 
6807 .big_copyin:
6808         ! We're going off to do a block copy.
        ! Switch fault handlers and grab a window. We
6810         ! don't do a membar #Sync since we've done only
6811         ! kernel data to this point.
6812         stn     %o4, [THREAD_REG + T_LOFAULT]
6813 
        ! Copyins that reach here are larger than 256 bytes. The
        ! hw_copy_limit_1 is set to 256. Never set this limit to
        ! less than 128 bytes.
6817         save    %sp, -SA(MINFRAME), %sp
6818 .do_blockcopyin:
6819 
6820         ! Swap src/dst since the code below is memcpy code
6821         ! and memcpy/bcopy have different calling sequences
6822         mov     %i1, %i5
6823         mov     %i0, %i1
6824         mov     %i5, %i0
6825 
6826         ! Block (64 bytes) align the destination.
6827         andcc   %i0, 0x3f, %i3          ! is dst block aligned
6828         bz      %ncc, copyin_blalign    ! dst already block aligned
6829         sub     %i3, 0x40, %i3
        neg     %i3                     ! bytes until dst is 64 byte aligned
        sub     %i2, %i3, %i2           ! update %i2 with new count
6832 
6833         ! Based on source and destination alignment do
6834         ! either 8 bytes, 4 bytes, 2 bytes or byte copy.
6835 
6836         ! Is dst & src 8B aligned
6837         or      %i0, %i1, %o2
6838         andcc   %o2, 0x7, %g0
6839         bz      %ncc, .ci_alewdcp
6840         nop
6841 
6842         ! Is dst & src 4B aligned
6843         andcc   %o2, 0x3, %g0
6844         bz      %ncc, .ci_alwdcp
6845         nop
6846 
6847         ! Is dst & src 2B aligned
6848         andcc   %o2, 0x1, %g0
6849         bz      %ncc, .ci_alhlfwdcp
6850         nop
6851 
6852         ! 1B aligned
6853 1:      lduba   [%i1]ASI_USER, %o2
6854         stb     %o2, [%i0]
6855         inc     %i1
6856         deccc   %i3
6857         bgu,pt  %ncc, 1b
6858         inc     %i0
6859 
6860         ba      copyin_blalign
6861         nop
6862 
6863         ! dst & src 4B aligned
6864 .ci_alwdcp:
6865         lda     [%i1]ASI_USER, %o2
6866         st      %o2, [%i0]
6867         add     %i1, 0x4, %i1
6868         subcc   %i3, 0x4, %i3
6869         bgu,pt  %ncc, .ci_alwdcp
6870         add     %i0, 0x4, %i0
6871 
6872         ba      copyin_blalign
6873         nop
6874 
6875         ! dst & src 2B aligned
6876 .ci_alhlfwdcp:
6877         lduha   [%i1]ASI_USER, %o2
6878         stuh    %o2, [%i0]
6879         add     %i1, 0x2, %i1
6880         subcc   %i3, 0x2, %i3
6881         bgu,pt  %ncc, .ci_alhlfwdcp
6882         add     %i0, 0x2, %i0
6883 
6884         ba      copyin_blalign
6885         nop
6886 
6887         ! dst & src 8B aligned
6888 .ci_alewdcp:
6889         ldxa    [%i1]ASI_USER, %o2
6890         stx     %o2, [%i0]
6891         add     %i1, 0x8, %i1
6892         subcc   %i3, 0x8, %i3
6893         bgu,pt  %ncc, .ci_alewdcp
6894         add     %i0, 0x8, %i0
6895 
6896 copyin_blalign:
6897         andn    %i2, 0x3f, %i3          ! %i3 count is multiple of block size
6898         sub     %i2, %i3, %i2           ! Residue bytes in %i2
6899 
6900         mov     ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
6901 
6902         andcc   %i1, 0xf, %o2           ! is src quadword aligned
6903         bz,pn   %xcc, .ci_blkcpy        ! src offset in %o2 (last 4-bits)
6904         nop
6905         cmp     %o2, 0x8
6906         bg      .ci_upper_double
6907         nop
6908         bl      .ci_lower_double
6909         nop
6910 
        ! Falls through when the source offset is equal to 8, i.e. the
        ! source is double word aligned.
        ! In this case no shift/merge of data is required.
6914 
6915         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
6916         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
6917         prefetcha [%l0]ASI_USER, #one_read
6918         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6919         add     %l0, 0x40, %l0
6920 .ci_loop0:
6921         add     %i1, 0x10, %i1
6922         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6923 
6924         prefetcha [%l0]ASI_USER, #one_read
6925 
6926         stxa    %l3, [%i0+0x0]%asi
6927         stxa    %l4, [%i0+0x8]%asi
6928 
6929         add     %i1, 0x10, %i1
6930         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6931 
6932         stxa    %l5, [%i0+0x10]%asi
6933         stxa    %l2, [%i0+0x18]%asi
6934 
6935         add     %i1, 0x10, %i1
6936         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6937 
6938         stxa    %l3, [%i0+0x20]%asi
6939         stxa    %l4, [%i0+0x28]%asi
6940 
6941         add     %i1, 0x10, %i1
6942         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6943 
6944         stxa    %l5, [%i0+0x30]%asi
6945         stxa    %l2, [%i0+0x38]%asi
6946 
6947         add     %l0, 0x40, %l0
6948         subcc   %i3, 0x40, %i3
6949         bgu,pt  %xcc, .ci_loop0
6950         add     %i0, 0x40, %i0
6951         ba      .ci_blkdone
6952         add     %i1, %o2, %i1           ! increment the source by src offset
6953                                         ! the src offset was stored in %o2
6954 
6955 .ci_lower_double:
6956 
6957         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
6958         sll     %o2, 3, %o0             ! %o0 left shift
6959         mov     0x40, %o1
6960         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
6961         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
6962         prefetcha [%l0]ASI_USER, #one_read
6963         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2    ! partial data in %l2
6964                                                         ! and %l3 has complete
6965                                                         ! data
6966         add     %l0, 0x40, %l0
6967 .ci_loop1:
6968         add     %i1, 0x10, %i1
6969         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4    ! %l4 has partial data
6970                                                         ! for this read.
6971         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)        ! merge %l2, %l3 and %l4
6972                                                         ! into %l2 and %l3
6973 
6974         prefetcha [%l0]ASI_USER, #one_read
6975 
6976         stxa    %l2, [%i0+0x0]%asi
6977         stxa    %l3, [%i0+0x8]%asi
6978 
6979         add     %i1, 0x10, %i1
6980         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6981         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)        ! merge %l2 with %l5 and
6982                                                         ! %l4 from previous read
6983                                                         ! into %l4 and %l5
6984         stxa    %l4, [%i0+0x10]%asi
6985         stxa    %l5, [%i0+0x18]%asi
6986 
6987         ! Repeat the same for next 32 bytes.
6988 
6989         add     %i1, 0x10, %i1
6990         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
6991         ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
6992 
6993         stxa    %l2, [%i0+0x20]%asi
6994         stxa    %l3, [%i0+0x28]%asi
6995 
6996         add     %i1, 0x10, %i1
6997         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
6998         ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
6999 
7000         stxa    %l4, [%i0+0x30]%asi
7001         stxa    %l5, [%i0+0x38]%asi
7002 
7003         add     %l0, 0x40, %l0
7004         subcc   %i3, 0x40, %i3
7005         bgu,pt  %xcc, .ci_loop1
7006         add     %i0, 0x40, %i0
7007         ba      .ci_blkdone
7008         add     %i1, %o2, %i1           ! increment the source by src offset
7009                                         ! the src offset was stored in %o2
7010 
7011 .ci_upper_double:
7012 
7013         sub     %i1, %o2, %i1           ! align the src at 16 bytes.
7014         sub     %o2, 0x8, %o0
7015         sll     %o0, 3, %o0             ! %o0 left shift
7016         mov     0x40, %o1
7017         sub     %o1, %o0, %o1           ! %o1 right shift = (64 - left shift)
7018         andn    %i1, 0x3f, %l0          ! %l0 has block aligned source
7019         prefetcha [%l0]ASI_USER, #one_read
7020         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2    ! partial data in %l3
7021                                                         ! for this read and
7022                                                         ! no data in %l2
7023         add     %l0, 0x40, %l0
7024 .ci_loop2:
7025         add     %i1, 0x10, %i1
7026         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4    ! %l4 has complete data
7027                                                         ! and %l5 has partial
7028         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)        ! merge %l3, %l4 and %l5
7029                                                         ! into %l3 and %l4
7030         prefetcha [%l0]ASI_USER, #one_read
7031 
7032         stxa    %l3, [%i0+0x0]%asi
7033         stxa    %l4, [%i0+0x8]%asi
7034 
7035         add     %i1, 0x10, %i1
7036         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7037         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)        ! merge %l2 and %l3 with
7038                                                         ! %l5 from previous read
7039                                                         ! into %l5 and %l2
7040 
7041         stxa    %l5, [%i0+0x10]%asi
7042         stxa    %l2, [%i0+0x18]%asi
7043 
7044         ! Repeat the same for next 32 bytes.
7045 
7046         add     %i1, 0x10, %i1
7047         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7048         ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
7049 
7050         stxa    %l3, [%i0+0x20]%asi
7051         stxa    %l4, [%i0+0x28]%asi
7052 
7053         add     %i1, 0x10, %i1
7054         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7055         ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
7056 
7057         stxa    %l5, [%i0+0x30]%asi
7058         stxa    %l2, [%i0+0x38]%asi
7059 
7060         add     %l0, 0x40, %l0
7061         subcc   %i3, 0x40, %i3
7062         bgu,pt  %xcc, .ci_loop2
7063         add     %i0, 0x40, %i0
7064         ba      .ci_blkdone
7065         add     %i1, %o2, %i1           ! increment the source by src offset
7066                                         ! the src offset was stored in %o2
7067 
7068 
7069         ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
7070 .ci_blkcpy:
7071 
7072         andn    %i1, 0x3f, %o0          ! %o0 has block aligned source
7073         prefetcha [%o0]ASI_USER, #one_read
7074         add     %o0, 0x40, %o0
7075 1:
7076         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
7077         add     %i1, 0x10, %i1
7078         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
7079         add     %i1, 0x10, %i1
7080 
7081         prefetcha [%o0]ASI_USER, #one_read
7082 
7083         stxa    %l0, [%i0+0x0]%asi
7084 
7085         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
7086         add     %i1, 0x10, %i1
7087         ldda    [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
7088         add     %i1, 0x10, %i1
7089 
7090         stxa    %l1, [%i0+0x8]%asi
7091         stxa    %l2, [%i0+0x10]%asi
7092         stxa    %l3, [%i0+0x18]%asi
7093         stxa    %l4, [%i0+0x20]%asi
7094         stxa    %l5, [%i0+0x28]%asi
7095         stxa    %l6, [%i0+0x30]%asi
7096         stxa    %l7, [%i0+0x38]%asi
7097 
7098         add     %o0, 0x40, %o0
7099         subcc   %i3, 0x40, %i3
7100         bgu,pt  %xcc, 1b
7101         add     %i0, 0x40, %i0
7102 
7103 .ci_blkdone:
7104         membar  #Sync
7105 
7106         brz,pt  %i2, .copyin_exit
7107         nop
7108 
7109         ! Handle trailing bytes
7110         cmp     %i2, 0x8
7111         blu,pt  %ncc, .ci_residue
7112         nop
7113 
7114         ! Can we do some 8B ops
7115         or      %i1, %i0, %o2
7116         andcc   %o2, 0x7, %g0
7117         bnz     %ncc, .ci_last4
7118         nop
7119 
7120         ! Do 8byte ops as long as possible
7121 .ci_last8:
7122         ldxa    [%i1]ASI_USER, %o2
7123         stx     %o2, [%i0]
7124         add     %i1, 0x8, %i1
7125         sub     %i2, 0x8, %i2
7126         cmp     %i2, 0x8
7127         bgu,pt  %ncc, .ci_last8
7128         add     %i0, 0x8, %i0
7129 
7130         brz,pt  %i2, .copyin_exit
7131         nop
7132 
7133         ba      .ci_residue
7134         nop
7135 
7136 .ci_last4:
7137         ! Can we do 4B ops
7138         andcc   %o2, 0x3, %g0
7139         bnz     %ncc, .ci_last2
7140         nop
7141 1:
7142         lda     [%i1]ASI_USER, %o2
7143         st      %o2, [%i0]
7144         add     %i1, 0x4, %i1
7145         sub     %i2, 0x4, %i2
7146         cmp     %i2, 0x4
7147         bgu,pt  %ncc, 1b
7148         add     %i0, 0x4, %i0
7149 
7150         brz,pt  %i2, .copyin_exit
7151         nop
7152 
7153         ba      .ci_residue
7154         nop
7155 
7156 .ci_last2:
7157         ! Can we do 2B ops
7158         andcc   %o2, 0x1, %g0
7159         bnz     %ncc, .ci_residue
7160         nop
7161 
7162 1:
7163         lduha   [%i1]ASI_USER, %o2
7164         stuh    %o2, [%i0]
7165         add     %i1, 0x2, %i1
7166         sub     %i2, 0x2, %i2
7167         cmp     %i2, 0x2
7168         bgu,pt  %ncc, 1b
7169         add     %i0, 0x2, %i0
7170 
7171         brz,pt  %i2, .copyin_exit
7172         nop
7173 
7174         ! Copy the residue as byte copy
7175 .ci_residue:
7176         lduba   [%i1]ASI_USER, %i4
7177         stb     %i4, [%i0]
7178         inc     %i1
7179         deccc   %i2
7180         bgu,pt  %xcc, .ci_residue
7181         inc     %i0
7182 
7183 .copyin_exit:
7184         membar  #Sync
7185         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
7186         ret
7187         restore %g0, 0, %o0
7188 .copyin_err:
7189         ldn     [THREAD_REG + T_COPYOPS], %o4
7190         brz     %o4, 2f
7191         nop
7192         ldn     [%o4 + CP_COPYIN], %g2
7193         jmp     %g2
7194         nop
7195 2:
7196         retl
7197         mov     -1, %o0
7198 #endif  /* NIAGARA_IMPL */
7199         SET_SIZE(copyin)
7200 
	ENTRY(xcopyin)
	sethi	%hi(.xcopyin_err), REAL_LOFAULT
	b	.do_copyin
	or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_XCOPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin)

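/*
 * xcopyin_little - copy a block in from user address space using the
 * little-endian user ASI (ASI_AIUSL).  The copy proceeds a byte at a
 * time, walking the source backwards from its last byte, so the
 * destination receives the source bytes in reverse order.
 */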
	ENTRY(xcopyin_little)
	sethi	%hi(.little_err), %o4
	ldn	[THREAD_REG + T_LOFAULT], %o5
	or	%o4, %lo(.little_err), %o4
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]

	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f			! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0			! start w/last byte
	add	%o1, %o2, %o1
	lduba	[%o0+%o3]ASI_AIUSL, %o4

1:	stb	%o4, [%o1+%o3]
	inccc	%o3
	sub	%o0, 2, %o0			! get next byte
	bcc,a,pt %ncc, 1b
	lduba	[%o0+%o3]ASI_AIUSL, %o4

2:	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0			! return (0)

.little_err:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin_little)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */
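
/*
 * A minimal caller-side usage sketch (illustrative only; assumes the
 * standard on_fault()/no_fault() interfaces from <sys/systm.h>):
 *
 *	label_t	ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);	! fault taken during the copy
 *	}
 *	copyin_noerr(uaddr, kaddr, len);
 *	no_fault();
 */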

	ENTRY(copyin_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyin
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
	jmp	SAVED_LOFAULT
	nop
	SET_SIZE(copyin_noerr)

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

	ENTRY(copyout_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyout
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
	SET_SIZE(copyout_noerr)

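/*
 * Tunables for the hardware copy/clear paths.  use_hw_bcopy and
 * use_hw_bzero disable the block-operation paths when zero.  The
 * hw_copy_limit_N values are byte-count thresholds, indexed by the
 * mutual alignment (1, 2, 4 or 8 bytes) of source and destination,
 * above which the hardware copy path is preferred.
 */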
	.align	4
	DGDEF(use_hw_bcopy)
	.word	1
	DGDEF(use_hw_bzero)
	.word	1
	DGDEF(hw_copy_limit_1)
	.word	0x100
	DGDEF(hw_copy_limit_2)
	.word	0x200
	DGDEF(hw_copy_limit_4)
	.word	0x400
	DGDEF(hw_copy_limit_8)
	.word	0x400

	.align	64
	.section ".text"

/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that
 * are 256 bytes or longer, using Niagara's block stores/quad store.
 * If the criteria for using this routine are not met, it falls back
 * to bzero and returns 1; otherwise it returns 0, indicating that
 * block operations were used.
 * The caller is responsible for ensuring that use_hw_bzero is true
 * and that kpreempt_disable() has been called.
 */
	! %i0 - start address
	! %i1 - length of region (multiple of 64)
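
/*
 * A C sketch of the entry checks (illustrative only; the actual
 * clearing is done with block-init quad stores through %asi):
 *
 *	int
 *	hwblkclr(void *addr, size_t len)
 *	{
 *		if (((uintptr_t)addr & 0x3f) != 0 ||	! not block aligned
 *		    len < 0x100 ||			! fewer than 256 bytes
 *		    (len & 0x3f) != 0) {		! not a multiple of 64
 *			bzero(addr, len);
 *			return (1);	! punted to bzero
 *		}
 *		... 256-byte, then 64-byte, zeroing passes ...
 *		return (0);		! block stores used
 *	}
 */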

	ENTRY(hwblkclr)
	save	%sp, -SA(MINFRAME), %sp

	! Must be block-aligned
	andcc	%i0, 0x3f, %g0
	bnz,pn	%ncc, 1f
	nop

	! ... and must be 256 bytes or more
	cmp	%i1, 0x100
	blu,pn	%ncc, 1f
	nop

	! ... and length must be a multiple of 64
	andcc	%i1, 0x3f, %g0
	bz,pn	%ncc, .pz_doblock
	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	mov	%i1, %o1
	ret
	restore	%g0, 1, %o0	! return (1) - did not use block operations

	! Already verified that there are at least 256 bytes to set
.pz_doblock:
	! Store to the first doubleword of each of the four 64-byte
	! lines first: with the block-init ASI this allocates the line
	! without fetching it.  Then fill in the rest of each line.
	stxa	%g0, [%i0+0x0]%asi
	stxa	%g0, [%i0+0x40]%asi
	stxa	%g0, [%i0+0x80]%asi
	stxa	%g0, [%i0+0xc0]%asi

	stxa	%g0, [%i0+0x8]%asi
	stxa	%g0, [%i0+0x10]%asi
	stxa	%g0, [%i0+0x18]%asi
	stxa	%g0, [%i0+0x20]%asi
	stxa	%g0, [%i0+0x28]%asi
	stxa	%g0, [%i0+0x30]%asi
	stxa	%g0, [%i0+0x38]%asi

	stxa	%g0, [%i0+0x48]%asi
	stxa	%g0, [%i0+0x50]%asi
	stxa	%g0, [%i0+0x58]%asi
	stxa	%g0, [%i0+0x60]%asi
	stxa	%g0, [%i0+0x68]%asi
	stxa	%g0, [%i0+0x70]%asi
	stxa	%g0, [%i0+0x78]%asi

	stxa	%g0, [%i0+0x88]%asi
	stxa	%g0, [%i0+0x90]%asi
	stxa	%g0, [%i0+0x98]%asi
	stxa	%g0, [%i0+0xa0]%asi
	stxa	%g0, [%i0+0xa8]%asi
	stxa	%g0, [%i0+0xb0]%asi
	stxa	%g0, [%i0+0xb8]%asi

	stxa	%g0, [%i0+0xc8]%asi
	stxa	%g0, [%i0+0xd0]%asi
	stxa	%g0, [%i0+0xd8]%asi
	stxa	%g0, [%i0+0xe0]%asi
	stxa	%g0, [%i0+0xe8]%asi
	stxa	%g0, [%i0+0xf0]%asi
	stxa	%g0, [%i0+0xf8]%asi

	sub	%i1, 0x100, %i1
	cmp	%i1, 0x100
	bgu,pt	%ncc, .pz_doblock
	add	%i0, 0x100, %i0

2:
	! Check if at least 64 bytes remain to set
	cmp	%i1, 0x40
	blu	%ncc, .pz_finish
	nop

3:
	stxa	%g0, [%i0+0x0]%asi
	stxa	%g0, [%i0+0x8]%asi
	stxa	%g0, [%i0+0x10]%asi
	stxa	%g0, [%i0+0x18]%asi
	stxa	%g0, [%i0+0x20]%asi
	stxa	%g0, [%i0+0x28]%asi
	stxa	%g0, [%i0+0x30]%asi
	stxa	%g0, [%i0+0x38]%asi

	subcc	%i1, 0x40, %i1
	bgu,pt	%ncc, 3b
	add	%i0, 0x40, %i0

.pz_finish:
	membar	#Sync
	ret
	restore	%g0, 0, %o0		! return (0) - block operations used
	SET_SIZE(hwblkclr)

	/*
	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
	 * using physical addresses.
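	 * Interrupts (PSTATE_IE) are disabled around the ASI_MEM
	 * accesses and the previous %pstate is restored on return.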
	 */
	ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1
	andn	%g1, PSTATE_IE, %g2
	wrpr	%g0, %g2, %pstate

	ldxa	[%o0]ASI_MEM, %o2
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o3
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o4
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o5
	stxa	%o2, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o3, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o4, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o5, [%o1]ASI_MEM

	membar	#Sync
	retl
	wrpr	%g0, %g1, %pstate
	SET_SIZE(hw_pa_bcopy32)

/*
 * Zero a block of storage.
 *
 * uzero is used by the kernel to zero a block in user address space.
 */

/*
 * Control flow of the bzero/kzero/uzero routine.
 *
 *	Stores of fewer than 7 bytes are done a byte at a time.
 *
 *	For 7 to 14 bytes, align the address on a 4-byte boundary,
 *	then store as many 4-byte chunks as possible, followed by the
 *	trailing bytes.
 *
 *	For 15 bytes or more, align the address on an 8-byte boundary.
 *	if (count >= 128) {
 *		store 8-byte chunks to block-align the address, then
 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
 *	}
 *	Store as many 8-byte chunks as possible, followed by trailing bytes.
 */
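
/*
 * A worked example (hypothetical sizes, following the flow above):
 * a bzero() of 300 bytes from an 8-byte-aligned address that is 0x20
 * bytes short of a 64-byte boundary decomposes as:
 *
 *	4 x 8-byte stores	! advance to the 64-byte boundary
 *	1 x 256-byte pass	! 268 bytes remain, 256 in full blocks
 *	1 x 8-byte store	! 12 bytes remain after the blocks
 *	4 x 1-byte stores	! final residue
 */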

	ENTRY(uzero)
	!
	! Set a new lo_fault handler only if we came in with one
	! already specified.
	!
	wr	%g0, ASI_USER, %asi
	ldn	[THREAD_REG + T_LOFAULT], %o5
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync
	ba,pt	%ncc, .do_zero
	stn	%o2, [THREAD_REG + T_LOFAULT]

	ENTRY(kzero)
	!
	! Always set a lo_fault handler
	!
	wr	%g0, ASI_P, %asi
	ldn	[THREAD_REG + T_LOFAULT], %o5
	sethi	%hi(.zeroerr), %o2
	or	%o5, LOFAULT_SET, %o5
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync
	ba,pt	%ncc, .do_zero
	stn	%o2, [THREAD_REG + T_LOFAULT]

/*
 * We got here because of a fault during kzero, or during uzero or
 * bzero when called with t_lofault non-zero.  Otherwise we've already
 * run screaming from the room.  The errno value is in %g1.  Note that
 * we're here iff we did set t_lofault.
 */
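/*
 * In C-like pseudocode (mirroring the code below, with "saved" being
 * the value preserved in %o5):
 *
 *	if (saved == 0)
 *		return (errno);			! errno is in %g1
 *	saved &= ~LOFAULT_SET;
 *	curthread->t_lofault = saved;		! restore previous value
 *	if (saved == 0)
 *		return (errno);			! only LOFAULT_SET was set
 *	goto saved;				! jump to previous handler
 */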
.zeroerr:
	!
	! Undo asi register setting. Just set it to be the
	! kernel default without checking.
	!
	wr	%g0, ASI_P, %asi

	!
	! We did set t_lofault. It may well have been zero coming in.
	!
1:
	tst	%o5
	membar	#Sync
	bne,pn	%ncc, 3f
	andncc	%o5, LOFAULT_SET, %o5
2:
	!
	! Old handler was zero. Just return the error.
	!
	retl				! return
	mov	%g1, %o0		! error code from %g1
3:
	!
	! We're here because %o5 was non-zero. It was non-zero
	! because either LOFAULT_SET was present, a previous fault
	! handler was present or both. In all cases we need to reset
	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
	! before we either simply return the error or we invoke the
	! previously specified handler.
	!
	be	%ncc, 2b
	stn	%o5, [THREAD_REG + T_LOFAULT]
	jmp	%o5			! goto real handler
	nop
	SET_SIZE(kzero)
	SET_SIZE(uzero)

/*
 * Zero a block of storage.
 */

	ENTRY(bzero)
	wr	%g0, ASI_P, %asi

	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector

.do_zero:
	cmp	%o1, 7			! fewer than 7 bytes to set?
	blu,pn	%ncc, .byteclr
	nop

	cmp	%o1, 15			! fewer than 15 bytes to set?
	blu,pn	%ncc, .wdalign
	nop

	andcc	%o0, 7, %o3		! is addr aligned on an 8-byte boundary?
	bz,pt	%ncc, .blkalign		! already double aligned
	sub	%o3, 8, %o3		! -(bytes till double aligned)
	add	%o1, %o3, %o1		! update o1 with new count

1:
	stba	%g0, [%o0]%asi
	inccc	%o3
	bl,pt	%ncc, 1b
	inc	%o0

	! Now address is double aligned
.blkalign:
	cmp	%o1, 0x80		! at least 128 bytes to set?
	blu,pn	%ncc, .bzero_small
	mov	%o1, %o3

	sethi	%hi(use_hw_bzero), %o2
	ld	[%o2 + %lo(use_hw_bzero)], %o2
	tst	%o2
	bz	%ncc, .bzero_small
	mov	%o1, %o3

	rd	%asi, %o3
	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	cmp	%o3, ASI_P
	bne,a	%ncc, .algnblk
	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

.algnblk:
	andcc	%o0, 0x3f, %o3		! is block aligned?
	bz,pt	%ncc, .bzero_blk
	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
	add	%o1, %o3, %o1		! o1 is the remainder

	! Clear -(%o3) bytes till block aligned
1:
	stxa	%g0, [%o0]%asi
	addcc	%o3, 8, %o3
	bl,pt	%ncc, 1b
	add	%o0, 8, %o0
.bzero_blk:
	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes

	cmp	%o4, 0x100		! 256 bytes or more
	blu,pn	%ncc, 3f
	nop

2:
	! Block-init the first doubleword of each of the four 64-byte
	! lines, then fill in the remainder of each line.
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x40]%asi
	stxa	%g0, [%o0+0x80]%asi
	stxa	%g0, [%o0+0xc0]%asi

	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	stxa	%g0, [%o0+0x48]%asi
	stxa	%g0, [%o0+0x50]%asi
	stxa	%g0, [%o0+0x58]%asi
	stxa	%g0, [%o0+0x60]%asi
	stxa	%g0, [%o0+0x68]%asi
	stxa	%g0, [%o0+0x70]%asi
	stxa	%g0, [%o0+0x78]%asi

	stxa	%g0, [%o0+0x88]%asi
	stxa	%g0, [%o0+0x90]%asi
	stxa	%g0, [%o0+0x98]%asi
	stxa	%g0, [%o0+0xa0]%asi
	stxa	%g0, [%o0+0xa8]%asi
	stxa	%g0, [%o0+0xb0]%asi
	stxa	%g0, [%o0+0xb8]%asi

	stxa	%g0, [%o0+0xc8]%asi
	stxa	%g0, [%o0+0xd0]%asi
	stxa	%g0, [%o0+0xd8]%asi
	stxa	%g0, [%o0+0xe0]%asi
	stxa	%g0, [%o0+0xe8]%asi
	stxa	%g0, [%o0+0xf0]%asi
	stxa	%g0, [%o0+0xf8]%asi

	sub	%o4, 0x100, %o4
	cmp	%o4, 0x100
	bgu,pt	%ncc, 2b
	add	%o0, 0x100, %o0

3:
	! ... at least 64 bytes left to set?
	cmp	%o4, 0x40
	blu	%ncc, .bzero_blk_done
	nop

4:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	subcc	%o4, 0x40, %o4
	bgu,pt	%ncc, 3b
	add	%o0, 0x40, %o0

.bzero_blk_done:
	membar	#Sync
	!
	! Undo asi register setting.
	!
	rd	%asi, %o4
	wr	%g0, ASI_P, %asi
	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
	bne,a	%ncc, .bzero_small
	wr	%g0, ASI_USER, %asi

.bzero_small:
	! Set the remaining doubles
	subcc	%o3, 8, %o3		! Can we store any doubles?
	blu,pn	%ncc, .byteclr
	and	%o1, 7, %o1		! calc bytes left after doubles

.dbclr:
	stxa	%g0, [%o0]%asi		! Clear the doubles
	subcc	%o3, 8, %o3
	bgeu,pt	%ncc, .dbclr
	add	%o0, 8, %o0

	ba	.byteclr
	nop

.wdalign:
	andcc	%o0, 3, %o3		! is addr aligned on a word boundary?
	bz,pn	%ncc, .wdclr
	andn	%o1, 3, %o3		! create word-sized count in %o3

	dec	%o1			! decrement count
	stba	%g0, [%o0]%asi		! clear a byte
	ba	.wdalign
	inc	%o0			! next byte

.wdclr:
	sta	%g0, [%o0]%asi		! 4-byte clearing loop
	subcc	%o3, 4, %o3
	bnz,pt	%ncc, .wdclr
	inc	4, %o0

	and	%o1, 3, %o1		! leftover count, if any

.byteclr:
	! Set the leftover bytes
	brz	%o1, .bzero_exit
	nop

7:
	deccc	%o1			! byte clearing loop
	stba	%g0, [%o0]%asi
	bgu,pt	%ncc, 7b
	inc	%o0

.bzero_exit:
	!
	! We're just concerned with whether t_lofault was set
	! when we came in. We end up here from either kzero()
	! or bzero(). kzero() *always* sets a lofault handler.
	! It ors LOFAULT_SET into %o5 to indicate it has done
	! this even if the value of %o5 is otherwise zero.
	! bzero() sets a lofault handler *only* if one was
	! previously set. Accordingly we need to examine
	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
	! before resetting the error handler.
	!
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
	retl
	clr	%o0			! return (0)

	SET_SIZE(bzero)