1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28 
  29 #include <sys/param.h>
  30 #include <sys/errno.h>
  31 #include <sys/asm_linkage.h>
  32 #include <sys/vtrace.h>
  33 #include <sys/machthread.h>
  34 #include <sys/clock.h>
  35 #include <sys/asi.h>
  36 #include <sys/fsr.h>
  37 #include <sys/privregs.h>
  38 
  39 #if !defined(lint)
  40 #include "assym.h"
  41 #endif  /* lint */
  42 
  43 
  44 /*
  45  * Pseudo-code to aid in understanding the control flow of the
  46  * bcopy routine.
  47  *
  48  * On entry to bcopy:
  49  *
  50  *      %l6 = curthread->t_lofault;
  51  *      used_block_copy = FALSE;                        ! (%l6 & 1) == 0
  52  *      if (%l6 != NULL) {
  53  *              curthread->t_lofault = .copyerr;
  54  *              caller_error_handler = TRUE             ! %l6 |= 2
  55  *      }
  56  *
  57  *      if (length < VIS_COPY)
  58  *              goto regular_copy;
  59  *
  60  *      if (!use_vis)
  61  *              goto regular_copy;
  62  *
  63  *      if (curthread->t_lwp == NULL) {
  64  *              ! Kernel threads do not have pcb's in which to store
  65  *              ! the floating point state, disallow preemption during
  66  *              ! the copy.
  67  *              kpreempt_disable(curthread);
  68  *      }
  69  *
  70  *      old_fprs = %fprs;
  71  *      old_gsr = %gsr;
  72  *      if (%fprs.fef) {
  73  *              ! If we need to save 4 blocks of fpregs then make sure
  74  *              ! the length is still appropriate for that extra overhead.
  75  *              if (length < (large_length + (64 * 4))) {
  76  *                      if (curthread->t_lwp == NULL)
  77  *                              kpreempt_enable(curthread);
  78  *                      goto regular_copy;
  79  *              }
  80  *              %fprs.fef = 1;
  81  *              save current fpregs on stack using blockstore
  82  *      } else {
  83  *              %fprs.fef = 1;
  84  *      }
  85  *
  86  *      used_block_copy = 1;                            ! %l6 |= 1
  87  *      do_blockcopy_here;
  88  *
  89  * In lofault handler:
  90  *      curthread->t_lofault = .copyerr2;
  91  *      Continue on with the normal exit handler
  92  *
  93  * On exit:
  94  *      call_kpreempt = 0;
  95  *      if (used_block_copy) {                          ! %l6 & 1
  96  *              %gsr = old_gsr;
  97  *              if (old_fprs & FPRS_FEF)
  98  *                      restore fpregs from stack using blockload
  99  *              else
 100  *                      zero fpregs
 101  *              %fprs = old_fprs;
 102  *              if (curthread->t_lwp == NULL) {
 103  *                      kpreempt_enable(curthread);
 104  *                      call_kpreempt = 1;
 105  *              }
 106  *      }
 107  *      curthread->t_lofault = (%l6 & ~3);
 108  *      if (call_kpreempt)
 109  *              kpreempt(%pil);
 110  *      return (0)
 111  *
 112  * In second lofault handler (.copyerr2):
 113  *      We've tried to restore fp state from the stack and failed.  To
 114  *      prevent from returning with a corrupted fp state, we will panic.
 115  */
 116 
 117 /*
 118  * Notes on preserving existing fp state:
 119  *
 120  * When a copyOP decides to use fp we may have to preserve existing
 121  * floating point state.  It is not the caller's state that we need to
 122  * preserve - the rest of the kernel does not use fp and, anyway, fp
 123  * registers are volatile across a call.  Some examples:
 124  *
 125  *      - userland has fp state and is interrupted (device interrupt
 126  *        or trap) and within the interrupt/trap handling we use
 127  *        bcopy()
 128  *      - another (higher level) interrupt or trap handler uses bcopy
 129  *        while a bcopy from an earlier interrupt is still active
 130  *      - an asynchronous error trap occurs while fp state exists (in
 131  *        userland or in kernel copy) and the tl0 component of the handling
 132  *        uses bcopy
 133  *      - a user process with fp state incurs a copy-on-write fault and
 134  *        hwblkpagecopy always uses fp
 135  *
 136  * We therefore need a per-call place in which to preserve fp state -
 137  * using our stack is ideal (and since fp copy cannot be leaf optimized
 138  * because of calls it makes, this is no hardship).
 139  *
 140  * To make sure that floating point state is always saved and restored
 141  * correctly, the following "big rules" must be followed when the floating
 142  * point registers will be used:
 143  *
 144  * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 145  *    bit 0 (FPUSED_FLAG, value 1) indicates that the floating point registers
 146  *    are in use.  Bit 1 (BCOPY_FLAG, value 2) indicates that the call was to bcopy.
 147  *
 148  * 2. The FPUSED flag indicates that all FP state has been successfully stored
 149  *    on the stack.  It should not be set until this save has been completed.
 150  *
 151  * 3. The FPUSED flag should not be cleared on exit until all FP state has
 152  *    been restored from the stack.  If an error occurs while restoring
 153  *    data from the stack, the error handler can check this flag to see if
 154  *    a restore is necessary.
 155  *
 156  * 4. Code run under the new lofault handler must be kept to a minimum.  In
 157  *    particular, any calls to kpreempt() should not be made until after the
 158  *    lofault handler has been restored.
 159  */
 160 
 161 /*
 162  * This shadows sys/machsystm.h which can't be included due to the lack of
 163  * _ASM guards in include files it references. Change it here, change it there.
      *
      * Byte count at or above which the copy code considers the VIS block-copy
      * path; .do_copy punts to the non-VIS copy for anything smaller, and
      * .bcb_fpregs_inuse raises the bar by (64 * 4) when live fp registers
      * would first have to be saved.
 164  */
 165 #define VIS_COPY_THRESHOLD 900
 166 
 167 /*
 168  * Less than or equal to this number of bytes we will always copy byte-for-byte
      *
      * NOTE(review): SMALL_LIMIT is not referenced in the part of this file
      * that was reviewed; the small-count test at .do_copy compares against
      * the literal 12 instead.  Confirm whether it is used further down.
 169  */
 170 #define SMALL_LIMIT     7
 171 
 172 /*
 173  * Flags set in the lower bits of the t_lofault address:
 174  * FPUSED_FLAG: The FP registers were in use and must be restored
 175  * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
 176  * COPY_FLAGS: Both of the above
 177  *
 178  * Other flags:
 179  * KPREEMPT_FLAG: kpreempt needs to be called
      *
      * FPUSED_FLAG and BCOPY_FLAG travel in %l6 together with the saved
      * handler address and are stripped with COPY_FLAGS before t_lofault is
      * restored.  KPREEMPT_FLAG is never stored in %l6; .copyerr keeps it in
      * the scratch register %l1.
 180  */
 181 #define FPUSED_FLAG     1
 182 #define BCOPY_FLAG      2
 183 #define COPY_FLAGS      (FPUSED_FLAG | BCOPY_FLAG)
 184 #define KPREEMPT_FLAG   4
 185 
 186 /*
 187  * Size of stack frame in order to accommodate a 64-byte aligned
 188  * floating-point register save area and 2 32-bit temp locations.
      *
      * The four fp register blocks need 4 * 64 bytes; the fifth 64 bytes is
      * slack so that the save address, computed as
      * (%fp + STACK_BIAS - 257) & -64, can be rounded down to a 64-byte
      * boundary and still lie inside the frame.  The two 32-bit temps hold
      * the saved %fprs and %gsr.
      *
      * All SAVED_*_OFFSET values are distances below %fp + STACK_BIAS, i.e.
      * they are subtracted when forming the address.
 189  */
 190 #define HWCOPYFRAMESIZE ((64 * 5) + (2 * 4))
 191 
 192 #define SAVED_FPREGS_OFFSET     (64 * 5)
 193 #define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 4)
 194 #define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 4)
 195 
 196 /*
 197  * Common macros used by the various versions of the block copy
 198  * routines in this file.
 199  */
 200 
     /*
      * FZERO: set all 32 double-precision fp registers (%f0 - %f62) to zero.
      *
      * %f0 and %f2 are cleared with fzero; every remaining register is then
      * written with the result of 0 + 0 (faddd) or 0 * 0 (fmuld) of those two.
      * The macro needs the fp unit enabled and clobbers every fp register.
      * .copyerr expands it when the saved %fprs shows FPRS_FEF was clear on
      * entry, so there was no fp state to reload from the stack.
      *
      * NOTE(review): the alternation of faddd and fmuld is presumably there
      * so that adds and multiplies can issue in parallel; nothing in this
      * file states the reason - confirm before relying on it.
      */
 201 #define FZERO                           \
 202         fzero   %f0                     ;\
 203         fzero   %f2                     ;\
 204         faddd   %f0, %f2, %f4           ;\
 205         fmuld   %f0, %f2, %f6           ;\
 206         faddd   %f0, %f2, %f8           ;\
 207         fmuld   %f0, %f2, %f10          ;\
 208         faddd   %f0, %f2, %f12          ;\
 209         fmuld   %f0, %f2, %f14          ;\
 210         faddd   %f0, %f2, %f16          ;\
 211         fmuld   %f0, %f2, %f18          ;\
 212         faddd   %f0, %f2, %f20          ;\
 213         fmuld   %f0, %f2, %f22          ;\
 214         faddd   %f0, %f2, %f24          ;\
 215         fmuld   %f0, %f2, %f26          ;\
 216         faddd   %f0, %f2, %f28          ;\
 217         fmuld   %f0, %f2, %f30          ;\
 218         faddd   %f0, %f2, %f32          ;\
 219         fmuld   %f0, %f2, %f34          ;\
 220         faddd   %f0, %f2, %f36          ;\
 221         fmuld   %f0, %f2, %f38          ;\
 222         faddd   %f0, %f2, %f40          ;\
 223         fmuld   %f0, %f2, %f42          ;\
 224         faddd   %f0, %f2, %f44          ;\
 225         fmuld   %f0, %f2, %f46          ;\
 226         faddd   %f0, %f2, %f48          ;\
 227         fmuld   %f0, %f2, %f50          ;\
 228         faddd   %f0, %f2, %f52          ;\
 229         fmuld   %f0, %f2, %f54          ;\
 230         faddd   %f0, %f2, %f56          ;\
 231         fmuld   %f0, %f2, %f58          ;\
 232         faddd   %f0, %f2, %f60          ;\
 233         fmuld   %f0, %f2, %f62
 234 
 235 
     /*
      * FALIGN_D0: shift the nine source doubles %d0 - %d16 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 236 #define FALIGN_D0                       \
 237         faligndata %d0, %d2, %d48       ;\
 238         faligndata %d2, %d4, %d50       ;\
 239         faligndata %d4, %d6, %d52       ;\
 240         faligndata %d6, %d8, %d54       ;\
 241         faligndata %d8, %d10, %d56      ;\
 242         faligndata %d10, %d12, %d58     ;\
 243         faligndata %d12, %d14, %d60     ;\
 244         faligndata %d14, %d16, %d62
 245 
     /*
      * FALIGN_D16: shift the nine source doubles %d16 - %d32 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 246 #define FALIGN_D16                      \
 247         faligndata %d16, %d18, %d48     ;\
 248         faligndata %d18, %d20, %d50     ;\
 249         faligndata %d20, %d22, %d52     ;\
 250         faligndata %d22, %d24, %d54     ;\
 251         faligndata %d24, %d26, %d56     ;\
 252         faligndata %d26, %d28, %d58     ;\
 253         faligndata %d28, %d30, %d60     ;\
 254         faligndata %d30, %d32, %d62
 255 
     /*
      * FALIGN_D32: shift the nine source doubles %d32 - %d46 and %d0 (the
      * source window wraps from %d46 back to %d0) into the eight-double
      * output block %d48 - %d62, one faligndata per output register.  The
      * shift amount comes from %gsr, which must have been set (with
      * alignaddr) before the macro is expanded.
      */
 256 #define FALIGN_D32                      \
 257         faligndata %d32, %d34, %d48     ;\
 258         faligndata %d34, %d36, %d50     ;\
 259         faligndata %d36, %d38, %d52     ;\
 260         faligndata %d38, %d40, %d54     ;\
 261         faligndata %d40, %d42, %d56     ;\
 262         faligndata %d42, %d44, %d58     ;\
 263         faligndata %d44, %d46, %d60     ;\
 264         faligndata %d46, %d0, %d62
 265 
     /*
      * FALIGN_D2: shift the nine source doubles %d2 - %d18 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 266 #define FALIGN_D2                       \
 267         faligndata %d2, %d4, %d48       ;\
 268         faligndata %d4, %d6, %d50       ;\
 269         faligndata %d6, %d8, %d52       ;\
 270         faligndata %d8, %d10, %d54      ;\
 271         faligndata %d10, %d12, %d56     ;\
 272         faligndata %d12, %d14, %d58     ;\
 273         faligndata %d14, %d16, %d60     ;\
 274         faligndata %d16, %d18, %d62
 275 
     /*
      * FALIGN_D18: shift the nine source doubles %d18 - %d34 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 276 #define FALIGN_D18                      \
 277         faligndata %d18, %d20, %d48     ;\
 278         faligndata %d20, %d22, %d50     ;\
 279         faligndata %d22, %d24, %d52     ;\
 280         faligndata %d24, %d26, %d54     ;\
 281         faligndata %d26, %d28, %d56     ;\
 282         faligndata %d28, %d30, %d58     ;\
 283         faligndata %d30, %d32, %d60     ;\
 284         faligndata %d32, %d34, %d62
 285 
     /*
      * FALIGN_D34: shift the nine source doubles %d34 - %d46, %d0 and %d2
      * (the source window wraps from %d46 back to %d0) into the eight-double
      * output block %d48 - %d62, one faligndata per output register.  The
      * shift amount comes from %gsr, which must have been set (with
      * alignaddr) before the macro is expanded.
      */
 286 #define FALIGN_D34                      \
 287         faligndata %d34, %d36, %d48     ;\
 288         faligndata %d36, %d38, %d50     ;\
 289         faligndata %d38, %d40, %d52     ;\
 290         faligndata %d40, %d42, %d54     ;\
 291         faligndata %d42, %d44, %d56     ;\
 292         faligndata %d44, %d46, %d58     ;\
 293         faligndata %d46, %d0, %d60      ;\
 294         faligndata %d0, %d2, %d62
 295 
     /*
      * FALIGN_D4: shift the nine source doubles %d4 - %d20 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 296 #define FALIGN_D4                       \
 297         faligndata %d4, %d6, %d48       ;\
 298         faligndata %d6, %d8, %d50       ;\
 299         faligndata %d8, %d10, %d52      ;\
 300         faligndata %d10, %d12, %d54     ;\
 301         faligndata %d12, %d14, %d56     ;\
 302         faligndata %d14, %d16, %d58     ;\
 303         faligndata %d16, %d18, %d60     ;\
 304         faligndata %d18, %d20, %d62
 305 
     /*
      * FALIGN_D20: shift the nine source doubles %d20 - %d36 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 306 #define FALIGN_D20                      \
 307         faligndata %d20, %d22, %d48     ;\
 308         faligndata %d22, %d24, %d50     ;\
 309         faligndata %d24, %d26, %d52     ;\
 310         faligndata %d26, %d28, %d54     ;\
 311         faligndata %d28, %d30, %d56     ;\
 312         faligndata %d30, %d32, %d58     ;\
 313         faligndata %d32, %d34, %d60     ;\
 314         faligndata %d34, %d36, %d62
 315 
     /*
      * FALIGN_D36: shift the nine source doubles %d36 - %d46 and %d0 - %d4
      * (the source window wraps from %d46 back to %d0) into the eight-double
      * output block %d48 - %d62, one faligndata per output register.  The
      * shift amount comes from %gsr, which must have been set (with
      * alignaddr) before the macro is expanded.
      */
 316 #define FALIGN_D36                      \
 317         faligndata %d36, %d38, %d48     ;\
 318         faligndata %d38, %d40, %d50     ;\
 319         faligndata %d40, %d42, %d52     ;\
 320         faligndata %d42, %d44, %d54     ;\
 321         faligndata %d44, %d46, %d56     ;\
 322         faligndata %d46, %d0, %d58      ;\
 323         faligndata %d0, %d2, %d60       ;\
 324         faligndata %d2, %d4, %d62
 325 
     /*
      * FALIGN_D6: shift the nine source doubles %d6 - %d22 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 326 #define FALIGN_D6                       \
 327         faligndata %d6, %d8, %d48       ;\
 328         faligndata %d8, %d10, %d50      ;\
 329         faligndata %d10, %d12, %d52     ;\
 330         faligndata %d12, %d14, %d54     ;\
 331         faligndata %d14, %d16, %d56     ;\
 332         faligndata %d16, %d18, %d58     ;\
 333         faligndata %d18, %d20, %d60     ;\
 334         faligndata %d20, %d22, %d62
 335 
     /*
      * FALIGN_D22: shift the nine source doubles %d22 - %d38 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 336 #define FALIGN_D22                      \
 337         faligndata %d22, %d24, %d48     ;\
 338         faligndata %d24, %d26, %d50     ;\
 339         faligndata %d26, %d28, %d52     ;\
 340         faligndata %d28, %d30, %d54     ;\
 341         faligndata %d30, %d32, %d56     ;\
 342         faligndata %d32, %d34, %d58     ;\
 343         faligndata %d34, %d36, %d60     ;\
 344         faligndata %d36, %d38, %d62
 345 
     /*
      * FALIGN_D38: shift the nine source doubles %d38 - %d46 and %d0 - %d6
      * (the source window wraps from %d46 back to %d0) into the eight-double
      * output block %d48 - %d62, one faligndata per output register.  The
      * shift amount comes from %gsr, which must have been set (with
      * alignaddr) before the macro is expanded.
      */
 346 #define FALIGN_D38                      \
 347         faligndata %d38, %d40, %d48     ;\
 348         faligndata %d40, %d42, %d50     ;\
 349         faligndata %d42, %d44, %d52     ;\
 350         faligndata %d44, %d46, %d54     ;\
 351         faligndata %d46, %d0, %d56      ;\
 352         faligndata %d0, %d2, %d58       ;\
 353         faligndata %d2, %d4, %d60       ;\
 354         faligndata %d4, %d6, %d62
 355 
     /*
      * FALIGN_D8: shift the nine source doubles %d8 - %d24 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 356 #define FALIGN_D8                       \
 357         faligndata %d8, %d10, %d48      ;\
 358         faligndata %d10, %d12, %d50     ;\
 359         faligndata %d12, %d14, %d52     ;\
 360         faligndata %d14, %d16, %d54     ;\
 361         faligndata %d16, %d18, %d56     ;\
 362         faligndata %d18, %d20, %d58     ;\
 363         faligndata %d20, %d22, %d60     ;\
 364         faligndata %d22, %d24, %d62
 365 
     /*
      * FALIGN_D24: shift the nine source doubles %d24 - %d40 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 366 #define FALIGN_D24                      \
 367         faligndata %d24, %d26, %d48     ;\
 368         faligndata %d26, %d28, %d50     ;\
 369         faligndata %d28, %d30, %d52     ;\
 370         faligndata %d30, %d32, %d54     ;\
 371         faligndata %d32, %d34, %d56     ;\
 372         faligndata %d34, %d36, %d58     ;\
 373         faligndata %d36, %d38, %d60     ;\
 374         faligndata %d38, %d40, %d62
 375 
     /*
      * FALIGN_D40: shift the nine source doubles %d40 - %d46 and %d0 - %d8
      * (the source window wraps from %d46 back to %d0) into the eight-double
      * output block %d48 - %d62, one faligndata per output register.  The
      * shift amount comes from %gsr, which must have been set (with
      * alignaddr) before the macro is expanded.
      */
 376 #define FALIGN_D40                      \
 377         faligndata %d40, %d42, %d48     ;\
 378         faligndata %d42, %d44, %d50     ;\
 379         faligndata %d44, %d46, %d52     ;\
 380         faligndata %d46, %d0, %d54      ;\
 381         faligndata %d0, %d2, %d56       ;\
 382         faligndata %d2, %d4, %d58       ;\
 383         faligndata %d4, %d6, %d60       ;\
 384         faligndata %d6, %d8, %d62
 385 
     /*
      * FALIGN_D10: shift the nine source doubles %d10 - %d26 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 386 #define FALIGN_D10                      \
 387         faligndata %d10, %d12, %d48     ;\
 388         faligndata %d12, %d14, %d50     ;\
 389         faligndata %d14, %d16, %d52     ;\
 390         faligndata %d16, %d18, %d54     ;\
 391         faligndata %d18, %d20, %d56     ;\
 392         faligndata %d20, %d22, %d58     ;\
 393         faligndata %d22, %d24, %d60     ;\
 394         faligndata %d24, %d26, %d62
 395 
     /*
      * FALIGN_D26: shift the nine source doubles %d26 - %d42 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 396 #define FALIGN_D26                      \
 397         faligndata %d26, %d28, %d48     ;\
 398         faligndata %d28, %d30, %d50     ;\
 399         faligndata %d30, %d32, %d52     ;\
 400         faligndata %d32, %d34, %d54     ;\
 401         faligndata %d34, %d36, %d56     ;\
 402         faligndata %d36, %d38, %d58     ;\
 403         faligndata %d38, %d40, %d60     ;\
 404         faligndata %d40, %d42, %d62
 405 
     /*
      * FALIGN_D42: shift the nine source doubles %d42 - %d46 and %d0 - %d10
      * (the source window wraps from %d46 back to %d0) into the eight-double
      * output block %d48 - %d62, one faligndata per output register.  The
      * shift amount comes from %gsr, which must have been set (with
      * alignaddr) before the macro is expanded.
      */
 406 #define FALIGN_D42                      \
 407         faligndata %d42, %d44, %d48     ;\
 408         faligndata %d44, %d46, %d50     ;\
 409         faligndata %d46, %d0, %d52      ;\
 410         faligndata %d0, %d2, %d54       ;\
 411         faligndata %d2, %d4, %d56       ;\
 412         faligndata %d4, %d6, %d58       ;\
 413         faligndata %d6, %d8, %d60       ;\
 414         faligndata %d8, %d10, %d62
 415 
     /*
      * FALIGN_D12: shift the nine source doubles %d12 - %d28 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 416 #define FALIGN_D12                      \
 417         faligndata %d12, %d14, %d48     ;\
 418         faligndata %d14, %d16, %d50     ;\
 419         faligndata %d16, %d18, %d52     ;\
 420         faligndata %d18, %d20, %d54     ;\
 421         faligndata %d20, %d22, %d56     ;\
 422         faligndata %d22, %d24, %d58     ;\
 423         faligndata %d24, %d26, %d60     ;\
 424         faligndata %d26, %d28, %d62
 425 
     /*
      * FALIGN_D28: shift the nine source doubles %d28 - %d44 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 426 #define FALIGN_D28                      \
 427         faligndata %d28, %d30, %d48     ;\
 428         faligndata %d30, %d32, %d50     ;\
 429         faligndata %d32, %d34, %d52     ;\
 430         faligndata %d34, %d36, %d54     ;\
 431         faligndata %d36, %d38, %d56     ;\
 432         faligndata %d38, %d40, %d58     ;\
 433         faligndata %d40, %d42, %d60     ;\
 434         faligndata %d42, %d44, %d62
 435 
     /*
      * FALIGN_D44: shift the nine source doubles %d44, %d46 and %d0 - %d12
      * (the source window wraps from %d46 back to %d0) into the eight-double
      * output block %d48 - %d62, one faligndata per output register.  The
      * shift amount comes from %gsr, which must have been set (with
      * alignaddr) before the macro is expanded.
      */
 436 #define FALIGN_D44                      \
 437         faligndata %d44, %d46, %d48     ;\
 438         faligndata %d46, %d0, %d50      ;\
 439         faligndata %d0, %d2, %d52       ;\
 440         faligndata %d2, %d4, %d54       ;\
 441         faligndata %d4, %d6, %d56       ;\
 442         faligndata %d6, %d8, %d58       ;\
 443         faligndata %d8, %d10, %d60      ;\
 444         faligndata %d10, %d12, %d62
 445 
     /*
      * FALIGN_D14: shift the nine source doubles %d14 - %d30 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 446 #define FALIGN_D14                      \
 447         faligndata %d14, %d16, %d48     ;\
 448         faligndata %d16, %d18, %d50     ;\
 449         faligndata %d18, %d20, %d52     ;\
 450         faligndata %d20, %d22, %d54     ;\
 451         faligndata %d22, %d24, %d56     ;\
 452         faligndata %d24, %d26, %d58     ;\
 453         faligndata %d26, %d28, %d60     ;\
 454         faligndata %d28, %d30, %d62
 455 
     /*
      * FALIGN_D30: shift the nine source doubles %d30 - %d46 into the
      * eight-double output block %d48 - %d62, one faligndata per output
      * register.  The shift amount comes from %gsr, which must have been
      * set (with alignaddr) before the macro is expanded.
      */
 456 #define FALIGN_D30                      \
 457         faligndata %d30, %d32, %d48     ;\
 458         faligndata %d32, %d34, %d50     ;\
 459         faligndata %d34, %d36, %d52     ;\
 460         faligndata %d36, %d38, %d54     ;\
 461         faligndata %d38, %d40, %d56     ;\
 462         faligndata %d40, %d42, %d58     ;\
 463         faligndata %d42, %d44, %d60     ;\
 464         faligndata %d44, %d46, %d62
 465 
     /*
      * FALIGN_D46: shift the nine source doubles %d46 and %d0 - %d14 (the
      * source window wraps from %d46 back to %d0) into the eight-double
      * output block %d48 - %d62, one faligndata per output register.  The
      * shift amount comes from %gsr, which must have been set (with
      * alignaddr) before the macro is expanded.
      */
 466 #define FALIGN_D46                      \
 467         faligndata %d46, %d0, %d48      ;\
 468         faligndata %d0, %d2, %d50       ;\
 469         faligndata %d2, %d4, %d52       ;\
 470         faligndata %d4, %d6, %d54       ;\
 471         faligndata %d6, %d8, %d56       ;\
 472         faligndata %d8, %d10, %d58      ;\
 473         faligndata %d10, %d12, %d60     ;\
 474         faligndata %d12, %d14, %d62
 475 
 476 
 477 /*
 478  * Copy a block of storage, returning an error code if `from' or
 479  * `to' takes a kernel pagefault which cannot be resolved.
 480  * Returns errno value on pagefault error, 0 if all ok
      *
      * kcopy opens a register window, installs .copyerr as the thread's
      * t_lofault handler, keeps the previously installed handler in %l6
      * (without BCOPY_FLAG) and then joins the copy code it shares with
      * bcopy at .do_copy.
      *
      * .copyerr is the fault handler used by both kcopy and bcopy.  On entry
      * it expects the errno value in %g1 and, in %l6, the saved handler
      * address with FPUSED_FLAG/BCOPY_FLAG in its low bits.  It restores any
      * fp state the copy saved, puts the old t_lofault back, calls kpreempt()
      * if that became necessary, and then either returns the errno (kcopy)
      * or jumps to the previously installed handler (bcopy).
      *
      * .copyerr2 is installed as t_lofault while .copyerr itself runs; a
      * fault there cannot be recovered from, so it panics.
 481  */
 482 
 483 
 484 
 485 #if defined(lint)
 486 
 487 /* ARGSUSED */
 488 int
 489 kcopy(const void *from, void *to, size_t count)
 490 { return(0); }
 491 
 492 #else   /* lint */
 493 
 494         .seg    ".text"
 495         .align  4
 496 
 497         ENTRY(kcopy)
 498 
 499         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
 500         set     .copyerr, %l6           ! copyerr is lofault value
 501         ldn     [THREAD_REG + T_LOFAULT], %l7   ! save existing handler
 502         membar  #Sync                   ! sync error barrier (see copy.s)
 503         stn     %l6, [THREAD_REG + T_LOFAULT]   ! set t_lofault
 504         !
 505         ! Note that we carefully do *not* flag the setting of
 506         ! t_lofault.
 507         !
 508         ba,pt   %ncc, .do_copy          ! common code
 509           mov   %l7, %l6                ! (delay slot) %l6 = saved handler
 510 
 511 /*
 512  * We got here because of a fault during kcopy or bcopy if a fault
 513  * handler existed when bcopy was called.
 514  * Errno value is in %g1.
 515  */
 516 .copyerr:
             ! From here on a further fault goes to .copyerr2, which panics.
 517         set     .copyerr2, %l1
 518         membar  #Sync                   ! sync error barrier
 519         stn     %l1, [THREAD_REG + T_LOFAULT]   ! set t_lofault
 520         btst    FPUSED_FLAG, %l6        ! did the copy take over the fp regs?
 521         bz      %icc, 1f                ! no: no fp state to put back
 522           and   %l6, BCOPY_FLAG, %l1    ! copy flag to %l1 (runs on both paths)
 523 
 524         membar  #Sync
 525 
 526         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
 527         wr      %o2, 0, %gsr
 528 
             ! %o3 = %fprs as it was on entry; FPRS_FEF set means the fp
             ! registers were saved on the stack and must be reloaded.
 529         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
 530         btst    FPRS_FEF, %o3
 531         bz      %icc, 4f                ! fp was not enabled: just zero the regs
 532           nop
 533 
 534         ! restore fpregs from stack
             ! The save area is the 64-byte aligned address at or below
             ! %fp + STACK_BIAS - 257; four blocks are loaded from it.
 535         membar  #Sync
 536         add     %fp, STACK_BIAS - 257, %o2
 537         and     %o2, -64, %o2
 538         ldda    [%o2]ASI_BLK_P, %d0
 539         add     %o2, 64, %o2
 540         ldda    [%o2]ASI_BLK_P, %d16
 541         add     %o2, 64, %o2
 542         ldda    [%o2]ASI_BLK_P, %d32
 543         add     %o2, 64, %o2
 544         ldda    [%o2]ASI_BLK_P, %d48
 545         membar  #Sync
 546 
 547         ba,pt   %ncc, 2f
 548           wr    %o3, 0, %fprs           ! restore fprs
 549 
 550 4:
 551         FZERO                           ! zero all of the fpregs
 552         wr      %o3, 0, %fprs           ! restore fprs
 553 
             ! Threads with an lwp skip the preemption bookkeeping below.
 554 2:      ldn     [THREAD_REG + T_LWP], %o2
 555         tst     %o2
 556         bnz,pt  %ncc, 1f
 557           nop
 558 
             ! No lwp: decrement the thread's T_PREEMPT count (the reverse
             ! of the increment bcopy labels kpreempt_disable()).  The store
             ! sits in the delay slot and is done whether or not we branch.
 559         ldsb    [THREAD_REG + T_PREEMPT], %l0
 560         deccc   %l0
 561         bnz,pn  %ncc, 1f                ! count still non-zero: no kpreempt
 562           stb   %l0, [THREAD_REG + T_PREEMPT]
 563 
 564         ! Check for a kernel preemption request
             ! The annulled delay slot sets KPREEMPT_FLAG only when the
             ! CPU_KPRUNRUN byte is non-zero; either way control reaches 1:.
 565         ldn     [THREAD_REG + T_CPU], %l0
 566         ldub    [%l0 + CPU_KPRUNRUN], %l0
 567         tst     %l0
 568         bnz,a,pt        %ncc, 1f        ! Need to call kpreempt?
 569           or    %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
 570 
 571         !
 572         ! Need to cater for the different expectations of kcopy
 573         ! and bcopy. kcopy will *always* set a t_lofault handler
 574         ! If it fires, we're expected to just return the error code
 575         ! and *not* to invoke any existing error handler. As far as
 576         ! bcopy is concerned, we only set t_lofault if there was an
 577         ! existing lofault handler. In that case we're expected to
 578         ! invoke the previously existing handler after resetting the
 579         ! t_lofault value.
 580         !
 581 1:
 582         andn    %l6, COPY_FLAGS, %l6    ! remove flags from lofault address
 583         membar  #Sync                   ! sync error barrier
 584         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
 585 
 586         ! call kpreempt if necessary
             ! This is done only now that the old t_lofault is back in place.
             ! NOTE(review): %g1 still holds the errno that is returned
             ! below and is live across this call - confirm that kpreempt
             ! cannot clobber %g1.
 587         btst    KPREEMPT_FLAG, %l1
 588         bz,pt   %icc, 2f
 589           nop
 590         call    kpreempt
 591           rdpr  %pil, %o0       ! pass %pil
 592 2:
 593         btst    BCOPY_FLAG, %l1
 594         bnz,pn  %ncc, 3f                ! entered via bcopy: chain to old handler
 595           nop
             ! kcopy: hand the errno in %g1 back to the caller as the result.
 596         ret
 597         restore %g1, 0, %o0
 598 
 599 3:
 600         !
 601         ! We're here via bcopy. There *must* have been an error handler
 602         ! in place otherwise we would have died a nasty death already.
 603         !
 604         jmp     %l6                             ! goto real handler
 605         restore %g0, 0, %o0                     ! dispose of copy window
 606 
 607 /*
 608  * We got here because of a fault in .copyerr.  We can't safely restore fp
 609  * state, so we panic.
 610  */
 611 fp_panic_msg:
 612         .asciz  "Unable to restore fp state after copy operation"
 613 
 614         .align  4
 615 .copyerr2:
 616         set     fp_panic_msg, %o0
 617         call    panic
 618           nop
 619         SET_SIZE(kcopy)
 620 #endif  /* lint */
 621 
 622 
 623 /*
 624  * Copy a block of storage - must not overlap (from + len <= to).
 625  * Registers: l6 - saved t_lofault
 626  *
 627  * Copy a page of memory.
 628  * Assumes double word alignment and a count >= 256.
 629  */
 630 #if defined(lint)
 631 
 632 /* ARGSUSED */
 633 void
 634 bcopy(const void *from, void *to, size_t count)
 635 {}
 636 
 637 #else   /* lint */
 638 
 639         ENTRY(bcopy)
 640 
 641         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
 642         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save t_lofault
 643         tst     %l6
 644         !
 645         ! We've already captured whether t_lofault was zero on entry.
 646         ! We need to mark ourselves as being from bcopy since both
 647         ! kcopy and bcopy use the same code path. If BCOPY_FLAG is set
 648         ! and the saved lofault was zero, we won't reset lofault on
 649         ! returning.
 650         !
 651         or      %l6, BCOPY_FLAG, %l6
 652         bz,pt   %ncc, .do_copy
 653         sethi   %hi(.copyerr), %o2
 654         or      %o2, %lo(.copyerr), %o2
 655         membar  #Sync                   ! sync error barrier
 656         stn     %o2, [THREAD_REG + T_LOFAULT]   ! install new vector
 657 
 658 .do_copy:
 659         cmp     %i2, 12                 ! for small counts
 660         blu     %ncc, .bytecp           ! just copy bytes
 661           .empty
 662 
 663         cmp     %i2, VIS_COPY_THRESHOLD ! for large counts
 664         blu,pt  %ncc, .bcb_punt
 665           .empty
 666 
 667         !
 668         ! Check to see if VIS acceleration is enabled
 669         !
 670         sethi   %hi(use_hw_bcopy), %o2
 671         ld      [%o2 + %lo(use_hw_bcopy)], %o2
 672         tst     %o2
 673         bz,pn   %icc, .bcb_punt
 674           nop
 675 
 676         subcc   %i1, %i0, %i3
 677         bneg,a,pn %ncc, 1f
 678         neg     %i3
 679 1:
 680         /*
 681          * Compare against 256 since we should be checking block addresses
 682          * and (dest & ~63) - (src & ~63) can be 3 blocks even if
 683          * src = dest + (64 * 3) + 63.
 684          */
 685         cmp     %i3, 256
 686         blu,pn  %ncc, .bcb_punt
 687           nop
 688 
 689         ldn     [THREAD_REG + T_LWP], %o3
 690         tst     %o3
 691         bnz,pt  %ncc, 1f
 692           nop
 693 
 694         ! kpreempt_disable();
 695         ldsb    [THREAD_REG + T_PREEMPT], %o2
 696         inc     %o2
 697         stb     %o2, [THREAD_REG + T_PREEMPT]
 698 
 699 1:
 700         rd      %fprs, %o2              ! check for unused fp
 701         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
 702         btst    FPRS_FEF, %o2
 703         bz,a    %icc, .do_blockcopy
 704           wr    %g0, FPRS_FEF, %fprs
 705 
 706 .bcb_fpregs_inuse:
 707         cmp     %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
 708         bgeu    %ncc, 1f                !  if we have to save the fpregs)
 709           nop
 710 
 711         tst     %o3
 712         bnz,pt  %ncc, .bcb_punt
 713           nop
 714 
 715         ldsb    [THREAD_REG + T_PREEMPT], %l0
 716         deccc   %l0
 717         bnz,pn  %icc, .bcb_punt
 718           stb   %l0, [THREAD_REG + T_PREEMPT]
 719 
 720         ! Check for a kernel preemption request
 721         ldn     [THREAD_REG + T_CPU], %l0
 722         ldub    [%l0 + CPU_KPRUNRUN], %l0
 723         tst     %l0
 724         bz,pt   %icc, .bcb_punt
 725           nop
 726 
 727         ! Attempt to preempt
 728         call    kpreempt
 729           rdpr    %pil, %o0               ! pass %pil
 730 
 731         ba,pt   %ncc, .bcb_punt
 732           nop
 733 
 734 1:
 735         wr      %g0, FPRS_FEF, %fprs
 736 
 737         ! save in-use fpregs on stack
 738         membar  #Sync
 739         add     %fp, STACK_BIAS - 257, %o2
 740         and     %o2, -64, %o2
 741         stda    %d0, [%o2]ASI_BLK_P
 742         add     %o2, 64, %o2
 743         stda    %d16, [%o2]ASI_BLK_P
 744         add     %o2, 64, %o2
 745         stda    %d32, [%o2]ASI_BLK_P
 746         add     %o2, 64, %o2
 747         stda    %d48, [%o2]ASI_BLK_P
 748         membar  #Sync
 749 
 750 .do_blockcopy:
 751         membar  #StoreStore|#StoreLoad|#LoadStore
 752 
 753         rd      %gsr, %o2
 754         st      %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
 755 
 756         ! Set the lower bit in the saved t_lofault to indicate
 757         ! that we need to clear the %fprs register on the way
 758         ! out
 759         or      %l6, FPUSED_FLAG, %l6
 760 
 761         ! Swap src/dst since the code below is memcpy code
 762         ! and memcpy/bcopy have different calling sequences
 763         mov     %i1, %i5
 764         mov     %i0, %i1
 765         mov     %i5, %i0
 766 
 767 !!! This code is nearly identical to the version in the sun4u
 768 !!! libc_psr.  Most bugfixes made to that file should be
 769 !!! merged into this routine.
 770 
 771         andcc   %i0, 7, %o3
 772         bz,pt   %ncc, blkcpy
 773         sub     %o3, 8, %o3
 774         neg     %o3
 775         sub     %i2, %o3, %i2
 776 
 777         ! Align Destination on double-word boundary
 778 
 779 2:      ldub    [%i1], %o4
 780         inc     %i1
 781         inc     %i0
 782         deccc   %o3
 783         bgu     %ncc, 2b
 784         stb     %o4, [%i0 - 1]
 785 blkcpy: 
 786         andcc   %i0, 63, %i3
 787         bz,pn   %ncc, blalign           ! now block aligned
 788         sub     %i3, 64, %i3
 789         neg     %i3                     ! bytes till block aligned
 790         sub     %i2, %i3, %i2           ! update %i2 with new count
 791 
 792         ! Copy %i3 bytes till dst is block (64 byte) aligned. use
 793         ! double word copies.
 794 
 795         alignaddr %i1, %g0, %g1
 796         ldd     [%g1], %d0
 797         add     %g1, 8, %g1
 798 6:
 799         ldd     [%g1], %d2
 800         add     %g1, 8, %g1
 801         subcc   %i3, 8, %i3
 802         faligndata %d0, %d2, %d8
 803         std     %d8, [%i0]
 804         add     %i1, 8, %i1
 805         bz,pn   %ncc, blalign
 806         add     %i0, 8, %i0
 807         ldd     [%g1], %d0
 808         add     %g1, 8, %g1
 809         subcc   %i3, 8, %i3
 810         faligndata %d2, %d0, %d8
 811         std     %d8, [%i0]
 812         add     %i1, 8, %i1
 813         bgu,pn  %ncc, 6b
 814         add     %i0, 8, %i0
 815  
 816 blalign:                            ! dst is 64-byte aligned: set up counts and dispatch
 817         membar  #StoreLoad
 818         ! %i2 = total length on entry; trailing byte count on exit
 819         ! %i3 = bytes moved as 64-byte blocks: (length - 64) & ~63
 820         ! %i4 = bytes moved as doubles after the blocks (see below)
 821         sub     %i2, 64, %i3
 822         andn    %i3, 63, %i3            ! %i3 = (length - 64) rounded down to a multiple of 64
 823         sub     %i2, %i3, %i4
 824         andn    %i4, 7, %i4
 825         sub     %i4, 16, %i4            ! %i4 = ((length - %i3) rounded down to 8) - 16
 826         sub     %i2, %i4, %i2
 827         sub     %i2, %i3, %i2           ! %i2 = bytes left for the byte loop at blkdone
 828 
 829         andn    %i1, 0x3f, %l7          ! blk aligned address
 830         alignaddr %i1, %g0, %g0         ! gen %gsr
 831 
 832         srl     %i1, 3, %l5             ! bits 3,4,5 of src are now least sig in %l5
 833         andcc   %l5, 7, %i5             ! %i5 = index (0-7) of the src double within its block
 834         add     %i1, %i4, %i1
 835         add     %i1, %i3, %i1           ! %i1 = src of the trailing bytes copied at blkdone
 836 
 837         ldda    [%l7]ASI_BLK_P, %d0     ! preload three 64-byte src blocks
 838         add     %l7, 64, %l7
 839         ldda    [%l7]ASI_BLK_P, %d16
 840         add     %l7, 64, %l7
 841         ldda    [%l7]ASI_BLK_P, %d32
 842         add     %l7, 64, %l7
 843         sub     %i3, 128, %i3           ! the 0:/1:/2: exits of each seg store two blocks uncounted
 844 
 845         ! switch statement to get us to the right 8 byte blk within a
 846         ! 64 byte block
 847         cmp      %i5, 4
 848         bgeu,a   hlf                    ! %i5 >= 4
 849         cmp      %i5, 6                 ! annulled delay slot: runs only if the branch is taken
 850         cmp      %i5, 2
 851         bgeu,a   sqtr                   ! %i5 is 2 or 3
 852         nop
 853         cmp      %i5, 1
 854         be,a     seg1                   ! %i5 == 1
 855         nop
 856         ba,pt    %ncc, seg0             ! %i5 == 0
 857         nop
 858 sqtr:                               ! flags still from cmp %i5, 2
 859         be,a     seg2                   ! %i5 == 2
 860         nop
 861         ba,pt    %ncc, seg3             ! %i5 == 3
 862         nop
 863 
 864 hlf:                                ! flags from cmp %i5, 6 in the delay slot above
 865         bgeu,a   fqtr                   ! %i5 is 6 or 7
 866         nop      
 867         cmp      %i5, 5
 868         be,a     seg5                   ! %i5 == 5
 869         nop
 870         ba,pt    %ncc, seg4             ! %i5 == 4
 871         nop
 872 fqtr:                               ! flags still from cmp %i5, 6
 873         be,a     seg6                   ! %i5 == 6
 874         nop
 875         ba,pt    %ncc, seg7             ! %i5 == 7
 876         nop
 877         
 878 
 879 seg0:                               ! entered when (src >> 3) & 7 == 0
 880         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
 881         FALIGN_D0                       ! macro from outside this view; presumably fills the %d48 bank
 882         ldda    [%l7]ASI_BLK_P, %d0     ! refill the %d0 bank with the next 64 src bytes
 883         stda    %d48, [%i0]ASI_BLK_P    ! write the 64 bytes in the %d48 bank to dst
 884         add     %l7, 64, %l7
 885         subcc   %i3, 64, %i3
 886         bz,pn   %ncc, 0f                ! counted blocks done: drain at 0: below
 887         add     %i0, 64, %i0            ! delay slot
 888         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
 889         FALIGN_D16
 890         ldda    [%l7]ASI_BLK_P, %d16
 891         stda    %d48, [%i0]ASI_BLK_P
 892         add     %l7, 64, %l7
 893         subcc   %i3, 64, %i3
 894         bz,pn   %ncc, 1f
 895         add     %i0, 64, %i0
 896         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
 897         FALIGN_D32
 898         ldda    [%l7]ASI_BLK_P, %d32
 899         stda    %d48, [%i0]ASI_BLK_P
 900         add     %l7, 64, %l7
 901         subcc   %i3, 64, %i3
 902         bz,pn   %ncc, 2f
 903         add     %i0, 64, %i0
 904         ba,a,pt %ncc, seg0              ! annulled: no delay slot is executed
 905 
 906 0:                                  ! last load refilled the %d0 bank
 907         FALIGN_D16
 908         stda    %d48, [%i0]ASI_BLK_P
 909         add     %i0, 64, %i0
 910         membar  #Sync
 911         FALIGN_D32
 912         stda    %d48, [%i0]ASI_BLK_P
 913         ba,pt   %ncc, blkd0             ! dribble the tail starting from %d0
 914         add     %i0, 64, %i0            ! delay slot
 915 
 916 1:                                  ! last load refilled the %d16 bank
 917         FALIGN_D32
 918         stda    %d48, [%i0]ASI_BLK_P
 919         add     %i0, 64, %i0
 920         membar  #Sync
 921         FALIGN_D0
 922         stda    %d48, [%i0]ASI_BLK_P
 923         ba,pt   %ncc, blkd16            ! dribble the tail starting from %d16
 924         add     %i0, 64, %i0            ! delay slot
 925 
 926 2:                                  ! last load refilled the %d32 bank
 927         FALIGN_D0
 928         stda    %d48, [%i0]ASI_BLK_P
 929         add     %i0, 64, %i0
 930         membar  #Sync
 931         FALIGN_D16
 932         stda    %d48, [%i0]ASI_BLK_P
 933         ba,pt   %ncc, blkd32            ! dribble the tail starting from %d32
 934         add     %i0, 64, %i0            ! delay slot
 935 
 936 seg1:                               ! entered when (src >> 3) & 7 == 1
 937         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
 938         FALIGN_D2                       ! macro from outside this view; presumably fills the %d48 bank
 939         ldda    [%l7]ASI_BLK_P, %d0     ! refill the %d0 bank with the next 64 src bytes
 940         stda    %d48, [%i0]ASI_BLK_P    ! write the 64 bytes in the %d48 bank to dst
 941         add     %l7, 64, %l7
 942         subcc   %i3, 64, %i3
 943         bz,pn   %ncc, 0f                ! counted blocks done: drain at 0: below
 944         add     %i0, 64, %i0            ! delay slot
 945         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
 946         FALIGN_D18
 947         ldda    [%l7]ASI_BLK_P, %d16
 948         stda    %d48, [%i0]ASI_BLK_P
 949         add     %l7, 64, %l7
 950         subcc   %i3, 64, %i3
 951         bz,pn   %ncc, 1f
 952         add     %i0, 64, %i0
 953         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
 954         FALIGN_D34
 955         ldda    [%l7]ASI_BLK_P, %d32
 956         stda    %d48, [%i0]ASI_BLK_P
 957         add     %l7, 64, %l7
 958         subcc   %i3, 64, %i3
 959         bz,pn   %ncc, 2f
 960         add     %i0, 64, %i0
 961         ba,a,pt %ncc, seg1              ! annulled: no delay slot is executed
 962 0:                                  ! last load refilled the %d0 bank
 963         FALIGN_D18
 964         stda    %d48, [%i0]ASI_BLK_P
 965         add     %i0, 64, %i0
 966         membar  #Sync
 967         FALIGN_D34
 968         stda    %d48, [%i0]ASI_BLK_P
 969         ba,pt   %ncc, blkd2             ! dribble the tail starting from %d2
 970         add     %i0, 64, %i0            ! delay slot
 971 
 972 1:                                  ! last load refilled the %d16 bank
 973         FALIGN_D34
 974         stda    %d48, [%i0]ASI_BLK_P
 975         add     %i0, 64, %i0
 976         membar  #Sync
 977         FALIGN_D2
 978         stda    %d48, [%i0]ASI_BLK_P
 979         ba,pt   %ncc, blkd18            ! dribble the tail starting from %d18
 980         add     %i0, 64, %i0            ! delay slot
 981 
 982 2:                                  ! last load refilled the %d32 bank
 983         FALIGN_D2
 984         stda    %d48, [%i0]ASI_BLK_P
 985         add     %i0, 64, %i0
 986         membar  #Sync
 987         FALIGN_D18
 988         stda    %d48, [%i0]ASI_BLK_P
 989         ba,pt   %ncc, blkd34            ! dribble the tail starting from %d34
 990         add     %i0, 64, %i0            ! delay slot
 991 
 992 seg2:                               ! entered when (src >> 3) & 7 == 2
 993         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
 994         FALIGN_D4                       ! macro from outside this view; presumably fills the %d48 bank
 995         ldda    [%l7]ASI_BLK_P, %d0     ! refill the %d0 bank with the next 64 src bytes
 996         stda    %d48, [%i0]ASI_BLK_P    ! write the 64 bytes in the %d48 bank to dst
 997         add     %l7, 64, %l7
 998         subcc   %i3, 64, %i3
 999         bz,pn   %ncc, 0f                ! counted blocks done: drain at 0: below
1000         add     %i0, 64, %i0            ! delay slot
1001         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1002         FALIGN_D20
1003         ldda    [%l7]ASI_BLK_P, %d16
1004         stda    %d48, [%i0]ASI_BLK_P
1005         add     %l7, 64, %l7
1006         subcc   %i3, 64, %i3
1007         bz,pn   %ncc, 1f
1008         add     %i0, 64, %i0
1009         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1010         FALIGN_D36
1011         ldda    [%l7]ASI_BLK_P, %d32
1012         stda    %d48, [%i0]ASI_BLK_P
1013         add     %l7, 64, %l7
1014         subcc   %i3, 64, %i3
1015         bz,pn   %ncc, 2f
1016         add     %i0, 64, %i0
1017         ba,a,pt %ncc, seg2              ! annulled: no delay slot is executed
1018 
1019 0:                                  ! last load refilled the %d0 bank
1020         FALIGN_D20
1021         stda    %d48, [%i0]ASI_BLK_P
1022         add     %i0, 64, %i0
1023         membar  #Sync
1024         FALIGN_D36
1025         stda    %d48, [%i0]ASI_BLK_P
1026         ba,pt   %ncc, blkd4             ! dribble the tail starting from %d4
1027         add     %i0, 64, %i0            ! delay slot
1028 
1029 1:                                  ! last load refilled the %d16 bank
1030         FALIGN_D36
1031         stda    %d48, [%i0]ASI_BLK_P
1032         add     %i0, 64, %i0
1033         membar  #Sync
1034         FALIGN_D4
1035         stda    %d48, [%i0]ASI_BLK_P
1036         ba,pt   %ncc, blkd20            ! dribble the tail starting from %d20
1037         add     %i0, 64, %i0            ! delay slot
1038 
1039 2:                                  ! last load refilled the %d32 bank
1040         FALIGN_D4
1041         stda    %d48, [%i0]ASI_BLK_P
1042         add     %i0, 64, %i0
1043         membar  #Sync
1044         FALIGN_D20
1045         stda    %d48, [%i0]ASI_BLK_P
1046         ba,pt   %ncc, blkd36            ! dribble the tail starting from %d36
1047         add     %i0, 64, %i0            ! delay slot
1048 
1049 seg3:                               ! entered when (src >> 3) & 7 == 3
1050         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1051         FALIGN_D6                       ! macro from outside this view; presumably fills the %d48 bank
1052         ldda    [%l7]ASI_BLK_P, %d0     ! refill the %d0 bank with the next 64 src bytes
1053         stda    %d48, [%i0]ASI_BLK_P    ! write the 64 bytes in the %d48 bank to dst
1054         add     %l7, 64, %l7
1055         subcc   %i3, 64, %i3
1056         bz,pn   %ncc, 0f                ! counted blocks done: drain at 0: below
1057         add     %i0, 64, %i0            ! delay slot
1058         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1059         FALIGN_D22
1060         ldda    [%l7]ASI_BLK_P, %d16
1061         stda    %d48, [%i0]ASI_BLK_P
1062         add     %l7, 64, %l7
1063         subcc   %i3, 64, %i3
1064         bz,pn   %ncc, 1f
1065         add     %i0, 64, %i0
1066         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1067         FALIGN_D38
1068         ldda    [%l7]ASI_BLK_P, %d32
1069         stda    %d48, [%i0]ASI_BLK_P
1070         add     %l7, 64, %l7
1071         subcc   %i3, 64, %i3
1072         bz,pn   %ncc, 2f
1073         add     %i0, 64, %i0
1074         ba,a,pt %ncc, seg3              ! annulled: no delay slot is executed
1075 
1076 0:                                  ! last load refilled the %d0 bank
1077         FALIGN_D22
1078         stda    %d48, [%i0]ASI_BLK_P
1079         add     %i0, 64, %i0
1080         membar  #Sync
1081         FALIGN_D38
1082         stda    %d48, [%i0]ASI_BLK_P
1083         ba,pt   %ncc, blkd6             ! dribble the tail starting from %d6
1084         add     %i0, 64, %i0            ! delay slot
1085 
1086 1:                                  ! last load refilled the %d16 bank
1087         FALIGN_D38
1088         stda    %d48, [%i0]ASI_BLK_P
1089         add     %i0, 64, %i0
1090         membar  #Sync
1091         FALIGN_D6
1092         stda    %d48, [%i0]ASI_BLK_P
1093         ba,pt   %ncc, blkd22            ! dribble the tail starting from %d22
1094         add     %i0, 64, %i0            ! delay slot
1095 
1096 2:                                  ! last load refilled the %d32 bank
1097         FALIGN_D6
1098         stda    %d48, [%i0]ASI_BLK_P
1099         add     %i0, 64, %i0
1100         membar  #Sync
1101         FALIGN_D22
1102         stda    %d48, [%i0]ASI_BLK_P
1103         ba,pt   %ncc, blkd38            ! dribble the tail starting from %d38
1104         add     %i0, 64, %i0            ! delay slot
1105 
1106 seg4:                               ! entered when (src >> 3) & 7 == 4
1107         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1108         FALIGN_D8                       ! macro from outside this view; presumably fills the %d48 bank
1109         ldda    [%l7]ASI_BLK_P, %d0     ! refill the %d0 bank with the next 64 src bytes
1110         stda    %d48, [%i0]ASI_BLK_P    ! write the 64 bytes in the %d48 bank to dst
1111         add     %l7, 64, %l7
1112         subcc   %i3, 64, %i3
1113         bz,pn   %ncc, 0f                ! counted blocks done: drain at 0: below
1114         add     %i0, 64, %i0            ! delay slot
1115         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1116         FALIGN_D24
1117         ldda    [%l7]ASI_BLK_P, %d16
1118         stda    %d48, [%i0]ASI_BLK_P
1119         add     %l7, 64, %l7
1120         subcc   %i3, 64, %i3
1121         bz,pn   %ncc, 1f
1122         add     %i0, 64, %i0
1123         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1124         FALIGN_D40
1125         ldda    [%l7]ASI_BLK_P, %d32
1126         stda    %d48, [%i0]ASI_BLK_P
1127         add     %l7, 64, %l7
1128         subcc   %i3, 64, %i3
1129         bz,pn   %ncc, 2f
1130         add     %i0, 64, %i0
1131         ba,a,pt %ncc, seg4              ! annulled: no delay slot is executed
1132 
1133 0:                                  ! last load refilled the %d0 bank
1134         FALIGN_D24
1135         stda    %d48, [%i0]ASI_BLK_P
1136         add     %i0, 64, %i0
1137         membar  #Sync
1138         FALIGN_D40
1139         stda    %d48, [%i0]ASI_BLK_P
1140         ba,pt   %ncc, blkd8             ! dribble the tail starting from %d8
1141         add     %i0, 64, %i0            ! delay slot
1142 
1143 1:                                  ! last load refilled the %d16 bank
1144         FALIGN_D40
1145         stda    %d48, [%i0]ASI_BLK_P
1146         add     %i0, 64, %i0
1147         membar  #Sync
1148         FALIGN_D8
1149         stda    %d48, [%i0]ASI_BLK_P
1150         ba,pt   %ncc, blkd24            ! dribble the tail starting from %d24
1151         add     %i0, 64, %i0            ! delay slot
1152 
1153 2:                                  ! last load refilled the %d32 bank
1154         FALIGN_D8
1155         stda    %d48, [%i0]ASI_BLK_P
1156         add     %i0, 64, %i0
1157         membar  #Sync
1158         FALIGN_D24
1159         stda    %d48, [%i0]ASI_BLK_P
1160         ba,pt   %ncc, blkd40            ! dribble the tail starting from %d40
1161         add     %i0, 64, %i0            ! delay slot
1162 
1163 seg5:                               ! entered when (src >> 3) & 7 == 5
1164         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1165         FALIGN_D10                      ! macro from outside this view; presumably fills the %d48 bank
1166         ldda    [%l7]ASI_BLK_P, %d0     ! refill the %d0 bank with the next 64 src bytes
1167         stda    %d48, [%i0]ASI_BLK_P    ! write the 64 bytes in the %d48 bank to dst
1168         add     %l7, 64, %l7
1169         subcc   %i3, 64, %i3
1170         bz,pn   %ncc, 0f                ! counted blocks done: drain at 0: below
1171         add     %i0, 64, %i0            ! delay slot
1172         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1173         FALIGN_D26
1174         ldda    [%l7]ASI_BLK_P, %d16
1175         stda    %d48, [%i0]ASI_BLK_P
1176         add     %l7, 64, %l7
1177         subcc   %i3, 64, %i3
1178         bz,pn   %ncc, 1f
1179         add     %i0, 64, %i0
1180         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1181         FALIGN_D42
1182         ldda    [%l7]ASI_BLK_P, %d32
1183         stda    %d48, [%i0]ASI_BLK_P
1184         add     %l7, 64, %l7
1185         subcc   %i3, 64, %i3
1186         bz,pn   %ncc, 2f
1187         add     %i0, 64, %i0
1188         ba,a,pt %ncc, seg5              ! annulled: no delay slot is executed
1189 
1190 0:                                  ! last load refilled the %d0 bank
1191         FALIGN_D26
1192         stda    %d48, [%i0]ASI_BLK_P
1193         add     %i0, 64, %i0
1194         membar  #Sync
1195         FALIGN_D42
1196         stda    %d48, [%i0]ASI_BLK_P
1197         ba,pt   %ncc, blkd10            ! dribble the tail starting from %d10
1198         add     %i0, 64, %i0            ! delay slot
1199 
1200 1:                                  ! last load refilled the %d16 bank
1201         FALIGN_D42
1202         stda    %d48, [%i0]ASI_BLK_P
1203         add     %i0, 64, %i0
1204         membar  #Sync
1205         FALIGN_D10
1206         stda    %d48, [%i0]ASI_BLK_P
1207         ba,pt   %ncc, blkd26            ! dribble the tail starting from %d26
1208         add     %i0, 64, %i0            ! delay slot
1209 
1210 2:                                  ! last load refilled the %d32 bank
1211         FALIGN_D10
1212         stda    %d48, [%i0]ASI_BLK_P
1213         add     %i0, 64, %i0
1214         membar  #Sync
1215         FALIGN_D26
1216         stda    %d48, [%i0]ASI_BLK_P
1217         ba,pt   %ncc, blkd42            ! dribble the tail starting from %d42
1218         add     %i0, 64, %i0            ! delay slot
1219 
1220 seg6:                               ! entered when (src >> 3) & 7 == 6
1221         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1222         FALIGN_D12                      ! macro from outside this view; presumably fills the %d48 bank
1223         ldda    [%l7]ASI_BLK_P, %d0     ! refill the %d0 bank with the next 64 src bytes
1224         stda    %d48, [%i0]ASI_BLK_P    ! write the 64 bytes in the %d48 bank to dst
1225         add     %l7, 64, %l7
1226         subcc   %i3, 64, %i3
1227         bz,pn   %ncc, 0f                ! counted blocks done: drain at 0: below
1228         add     %i0, 64, %i0            ! delay slot
1229         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1230         FALIGN_D28
1231         ldda    [%l7]ASI_BLK_P, %d16
1232         stda    %d48, [%i0]ASI_BLK_P
1233         add     %l7, 64, %l7
1234         subcc   %i3, 64, %i3
1235         bz,pn   %ncc, 1f
1236         add     %i0, 64, %i0
1237         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1238         FALIGN_D44
1239         ldda    [%l7]ASI_BLK_P, %d32
1240         stda    %d48, [%i0]ASI_BLK_P
1241         add     %l7, 64, %l7
1242         subcc   %i3, 64, %i3
1243         bz,pn   %ncc, 2f
1244         add     %i0, 64, %i0
1245         ba,a,pt %ncc, seg6              ! annulled: no delay slot is executed
1246 
1247 0:                                  ! last load refilled the %d0 bank
1248         FALIGN_D28
1249         stda    %d48, [%i0]ASI_BLK_P
1250         add     %i0, 64, %i0
1251         membar  #Sync
1252         FALIGN_D44
1253         stda    %d48, [%i0]ASI_BLK_P
1254         ba,pt   %ncc, blkd12            ! dribble the tail starting from %d12
1255         add     %i0, 64, %i0            ! delay slot
1256 
1257 1:                                  ! last load refilled the %d16 bank
1258         FALIGN_D44
1259         stda    %d48, [%i0]ASI_BLK_P
1260         add     %i0, 64, %i0
1261         membar  #Sync
1262         FALIGN_D12
1263         stda    %d48, [%i0]ASI_BLK_P
1264         ba,pt   %ncc, blkd28            ! dribble the tail starting from %d28
1265         add     %i0, 64, %i0            ! delay slot
1266 
1267 2:                                  ! last load refilled the %d32 bank
1268         FALIGN_D12
1269         stda    %d48, [%i0]ASI_BLK_P
1270         add     %i0, 64, %i0
1271         membar  #Sync
1272         FALIGN_D28
1273         stda    %d48, [%i0]ASI_BLK_P
1274         ba,pt   %ncc, blkd44            ! dribble the tail starting from %d44
1275         add     %i0, 64, %i0            ! delay slot
1276 
1277 seg7:                               ! entered when (src >> 3) & 7 == 7
1278         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1279         FALIGN_D14                      ! macro from outside this view; presumably fills the %d48 bank
1280         ldda    [%l7]ASI_BLK_P, %d0     ! refill the %d0 bank with the next 64 src bytes
1281         stda    %d48, [%i0]ASI_BLK_P    ! write the 64 bytes in the %d48 bank to dst
1282         add     %l7, 64, %l7
1283         subcc   %i3, 64, %i3
1284         bz,pn   %ncc, 0f                ! counted blocks done: drain at 0: below
1285         add     %i0, 64, %i0            ! delay slot
1286         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1287         FALIGN_D30
1288         ldda    [%l7]ASI_BLK_P, %d16
1289         stda    %d48, [%i0]ASI_BLK_P
1290         add     %l7, 64, %l7
1291         subcc   %i3, 64, %i3
1292         bz,pn   %ncc, 1f
1293         add     %i0, 64, %i0
1294         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1295         FALIGN_D46
1296         ldda    [%l7]ASI_BLK_P, %d32
1297         stda    %d48, [%i0]ASI_BLK_P
1298         add     %l7, 64, %l7
1299         subcc   %i3, 64, %i3
1300         bz,pn   %ncc, 2f
1301         add     %i0, 64, %i0
1302         ba,a,pt %ncc, seg7              ! annulled: no delay slot is executed
1303 
1304 0:                                  ! last load refilled the %d0 bank
1305         FALIGN_D30
1306         stda    %d48, [%i0]ASI_BLK_P
1307         add     %i0, 64, %i0
1308         membar  #Sync
1309         FALIGN_D46
1310         stda    %d48, [%i0]ASI_BLK_P
1311         ba,pt   %ncc, blkd14            ! dribble the tail starting from %d14
1312         add     %i0, 64, %i0            ! delay slot
1313 
1314 1:                                  ! last load refilled the %d16 bank
1315         FALIGN_D46
1316         stda    %d48, [%i0]ASI_BLK_P
1317         add     %i0, 64, %i0
1318         membar  #Sync
1319         FALIGN_D14
1320         stda    %d48, [%i0]ASI_BLK_P
1321         ba,pt   %ncc, blkd30            ! dribble the tail starting from %d30
1322         add     %i0, 64, %i0            ! delay slot
1323 
1324 2:                                  ! last load refilled the %d32 bank
1325         FALIGN_D14
1326         stda    %d48, [%i0]ASI_BLK_P
1327         add     %i0, 64, %i0
1328         membar  #Sync
1329         FALIGN_D30
1330         stda    %d48, [%i0]ASI_BLK_P
1331         ba,pt   %ncc, blkd46            ! dribble the tail starting from %d46
1332         add     %i0, 64, %i0            ! delay slot
1333 
1334 
1335         ! dribble out the last partial block: each blkdN step emits one
1336         ! double built from registers that are already loaded, and leaves
1337         ! for blkdone as soon as fewer than 8 bytes remain in %i4
1338 blkd0:
1339         subcc   %i4, 8, %i4
1340         blu,pn  %ncc, blkdone           ! %i4 was below 8: nothing more to dribble
1341         faligndata %d0, %d2, %d48       ! delay slot: executed on both paths
1342         std     %d48, [%i0]
1343         add     %i0, 8, %i0
1344 blkd2:
1345         subcc   %i4, 8, %i4
1346         blu,pn  %ncc, blkdone
1347         faligndata %d2, %d4, %d48
1348         std     %d48, [%i0]
1349         add     %i0, 8, %i0
1350 blkd4:
1351         subcc   %i4, 8, %i4
1352         blu,pn  %ncc, blkdone
1353         faligndata %d4, %d6, %d48
1354         std     %d48, [%i0]
1355         add     %i0, 8, %i0
1356 blkd6:
1357         subcc   %i4, 8, %i4
1358         blu,pn  %ncc, blkdone
1359         faligndata %d6, %d8, %d48
1360         std     %d48, [%i0]
1361         add     %i0, 8, %i0
1362 blkd8:
1363         subcc   %i4, 8, %i4
1364         blu,pn  %ncc, blkdone
1365         faligndata %d8, %d10, %d48
1366         std     %d48, [%i0]
1367         add     %i0, 8, %i0
1368 blkd10:
1369         subcc   %i4, 8, %i4
1370         blu,pn  %ncc, blkdone
1371         faligndata %d10, %d12, %d48
1372         std     %d48, [%i0]
1373         add     %i0, 8, %i0
1374 blkd12:
1375         subcc   %i4, 8, %i4
1376         blu,pn  %ncc, blkdone
1377         faligndata %d12, %d14, %d48
1378         std     %d48, [%i0]
1379         add     %i0, 8, %i0
1380 blkd14:
1381         subcc   %i4, 8, %i4
1382         blu,pn  %ncc, blkdone
1383         fsrc1   %d14, %d0               ! delay slot: carry the last loaded double into %d0
1384         ba,a,pt %ncc, blkleft           ! bank used up: blkleft continues with ldd (annulled, no delay slot)
1385 
1386 blkd16:                             ! same dribble chain, reading the %d16-%d30 bank
1387         subcc   %i4, 8, %i4
1388         blu,pn  %ncc, blkdone           ! %i4 was below 8: nothing more to dribble
1389         faligndata %d16, %d18, %d48     ! delay slot: executed on both paths
1390         std     %d48, [%i0]
1391         add     %i0, 8, %i0
1392 blkd18:
1393         subcc   %i4, 8, %i4
1394         blu,pn  %ncc, blkdone
1395         faligndata %d18, %d20, %d48
1396         std     %d48, [%i0]
1397         add     %i0, 8, %i0
1398 blkd20:
1399         subcc   %i4, 8, %i4
1400         blu,pn  %ncc, blkdone
1401         faligndata %d20, %d22, %d48
1402         std     %d48, [%i0]
1403         add     %i0, 8, %i0
1404 blkd22:
1405         subcc   %i4, 8, %i4
1406         blu,pn  %ncc, blkdone
1407         faligndata %d22, %d24, %d48
1408         std     %d48, [%i0]
1409         add     %i0, 8, %i0
1410 blkd24:
1411         subcc   %i4, 8, %i4
1412         blu,pn  %ncc, blkdone
1413         faligndata %d24, %d26, %d48
1414         std     %d48, [%i0]
1415         add     %i0, 8, %i0
1416 blkd26:
1417         subcc   %i4, 8, %i4
1418         blu,pn  %ncc, blkdone
1419         faligndata %d26, %d28, %d48
1420         std     %d48, [%i0]
1421         add     %i0, 8, %i0
1422 blkd28:
1423         subcc   %i4, 8, %i4
1424         blu,pn  %ncc, blkdone
1425         faligndata %d28, %d30, %d48
1426         std     %d48, [%i0]
1427         add     %i0, 8, %i0
1428 blkd30:
1429         subcc   %i4, 8, %i4
1430         blu,pn  %ncc, blkdone
1431         fsrc1   %d30, %d0               ! delay slot: carry the last loaded double into %d0
1432         ba,a,pt %ncc, blkleft           ! bank used up: blkleft continues with ldd (annulled, no delay slot)
1433 blkd32:                             ! same dribble chain, reading the %d32-%d46 bank
1434         subcc   %i4, 8, %i4
1435         blu,pn  %ncc, blkdone           ! %i4 was below 8: nothing more to dribble
1436         faligndata %d32, %d34, %d48     ! delay slot: executed on both paths
1437         std     %d48, [%i0]
1438         add     %i0, 8, %i0
1439 blkd34:
1440         subcc   %i4, 8, %i4
1441         blu,pn  %ncc, blkdone
1442         faligndata %d34, %d36, %d48
1443         std     %d48, [%i0]
1444         add     %i0, 8, %i0
1445 blkd36:
1446         subcc   %i4, 8, %i4
1447         blu,pn  %ncc, blkdone
1448         faligndata %d36, %d38, %d48
1449         std     %d48, [%i0]
1450         add     %i0, 8, %i0
1451 blkd38:
1452         subcc   %i4, 8, %i4
1453         blu,pn  %ncc, blkdone
1454         faligndata %d38, %d40, %d48
1455         std     %d48, [%i0]
1456         add     %i0, 8, %i0
1457 blkd40:
1458         subcc   %i4, 8, %i4
1459         blu,pn  %ncc, blkdone
1460         faligndata %d40, %d42, %d48
1461         std     %d48, [%i0]
1462         add     %i0, 8, %i0
1463 blkd42:
1464         subcc   %i4, 8, %i4
1465         blu,pn  %ncc, blkdone
1466         faligndata %d42, %d44, %d48
1467         std     %d48, [%i0]
1468         add     %i0, 8, %i0
1469 blkd44:
1470         subcc   %i4, 8, %i4
1471         blu,pn  %ncc, blkdone
1472         faligndata %d44, %d46, %d48
1473         std     %d48, [%i0]
1474         add     %i0, 8, %i0
1475 blkd46:
1476         subcc   %i4, 8, %i4
1477         blu,pn  %ncc, blkdone
1478         fsrc1   %d46, %d0               ! delay slot: carry the last loaded double into %d0, then fall into blkleft
1479 
1480 blkleft:                            ! copy remaining doubles with ldd; %d0 holds the previous src double
1481 1:
1482         ldd     [%l7], %d2              ! next aligned src double
1483         add     %l7, 8, %l7
1484         subcc   %i4, 8, %i4
1485         faligndata %d0, %d2, %d8
1486         std     %d8, [%i0]              ! issued before the test below, so it also runs on the exiting pass
1487         blu,pn  %ncc, blkdone           ! flags from the subcc above
1488         add     %i0, 8, %i0             ! delay slot
1489         ldd     [%l7], %d0              ! second half: %d2 is now the older double
1490         add     %l7, 8, %l7
1491         subcc   %i4, 8, %i4
1492         faligndata %d2, %d0, %d8
1493         std     %d8, [%i0]
1494         bgeu,pt %ncc, 1b                ! loop while the subcc did not borrow
1495         add     %i0, 8, %i0             ! delay slot
1496 
1497 blkdone:                            ! block and double copies finished; %i2 = trailing byte count
1498         tst     %i2
1499         bz,pt   %ncc, .bcb_exit         ! no trailing bytes
1500         and     %l3, 0x4, %l3           ! fprs.du = fprs.dl = 0
1501 
1502 7:      ldub    [%i1], %i4              ! byte loop: src %i1, dst %i0, count %i2
1503         inc     %i1
1504         inc     %i0
1505         deccc   %i2
1506         bgu,pt  %ncc, 7b
1507           stb     %i4, [%i0 - 1]        ! delay slot: %i0 was already advanced, hence the - 1
1508 
1509 .bcb_exit:                          ! common exit: restore FP state, t_lofault, maybe preempt; returns 0
1510         membar  #StoreLoad|#StoreStore
1511         btst    FPUSED_FLAG, %l6
1512         bz      %icc, 1f                ! FP regs were not used: skip the FP restore
1513           and   %l6, COPY_FLAGS, %l1    ! Store flags in %l1
1514                                         ! We can't clear the flags from %l6 yet.
1515                                         ! If there's an error, .copyerr will
1516                                         ! need them
1517 
1518         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
1519         wr      %o2, 0, %gsr
1520 
1521         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3     ! %o3 = fprs value saved on the stack
1522         btst    FPRS_FEF, %o3
1523         bz      %icc, 4f                ! FEF clear in the saved fprs: zero the regs instead of reloading
1524           nop
1525 
1526         ! restore fpregs from stack
1527         membar  #Sync
1528         add     %fp, STACK_BIAS - 257, %o2
1529         and     %o2, -64, %o2           ! %o2 = 64-byte aligned save area below %fp
1530         ldda    [%o2]ASI_BLK_P, %d0     ! reload four 64-byte banks: %d0, %d16, %d32, %d48
1531         add     %o2, 64, %o2
1532         ldda    [%o2]ASI_BLK_P, %d16
1533         add     %o2, 64, %o2
1534         ldda    [%o2]ASI_BLK_P, %d32
1535         add     %o2, 64, %o2
1536         ldda    [%o2]ASI_BLK_P, %d48
1537         membar  #Sync
1538 
1539         ba,pt   %ncc, 2f                ! skip the zeroing path
1540           wr    %o3, 0, %fprs           ! restore fprs
1541 
1542 4:
1543         FZERO                           ! zero all of the fpregs
1544         wr      %o3, 0, %fprs           ! restore fprs
1545 
1546 2:      ldn     [THREAD_REG + T_LWP], %o2
1547         tst     %o2
1548         bnz,pt  %ncc, 1f                ! t_lwp is non-zero: skip the t_preempt handling
1549           nop
1550 
1551         ldsb    [THREAD_REG + T_PREEMPT], %l0
1552         deccc   %l0
1553         bnz,pn  %ncc, 1f                ! count still non-zero after the decrement
1554           stb   %l0, [THREAD_REG + T_PREEMPT]   ! delay slot: write back the decremented count
1555 
1556         ! Check for a kernel preemption request
1557         ldn     [THREAD_REG + T_CPU], %l0
1558         ldub    [%l0 + CPU_KPRUNRUN], %l0
1559         tst     %l0
1560         bnz,a,pt        %ncc, 1f        ! Need to call kpreempt?
1561           or    %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
1562 
1563 1:
1564         btst    BCOPY_FLAG, %l1
1565         bz,pn   %icc, 3f                ! not here via bcopy: always restore t_lofault
1566           andncc        %l6, COPY_FLAGS, %l6    ! delay slot: strip the flag bits, leaving the saved handler
1567 
1568         !
1569         ! Here via bcopy. Check to see if the handler was NULL.
1570         ! If so, just return quietly. Otherwise, reset the
1571         ! handler and go home.
1572         ! 
1573         bnz,pn  %ncc, 3f                ! flags from the andncc above: handler was not NULL
1574           nop
1575 
1576         !
1577         ! Null handler.  Check for kpreempt flag, call if necessary,
1578         ! then return.
1579         !
1580         btst    KPREEMPT_FLAG, %l1
1581         bz,pt   %icc, 2f
1582           nop
1583         call    kpreempt
1584           rdpr  %pil, %o0       ! pass %pil
1585 2:
1586         ret
1587           restore       %g0, 0, %o0     ! delay slot: return 0
1588 
1589         !
1590         ! Here via kcopy or bcopy with a handler.  Reset the
1591         ! fault handler.
1592         !
1593 3:
1594         membar  #Sync
1595         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1596 
1597         ! call kpreempt if necessary
1598         btst    KPREEMPT_FLAG, %l1
1599         bz,pt   %icc, 4f
1600           nop
1601         call    kpreempt
1602           rdpr  %pil, %o0       ! pass %pil
1603 4:
1604         ret
1605           restore       %g0, 0, %o0     ! delay slot: return 0
1606 
1607 .bcb_punt:
1608         !
1609         ! use aligned transfers where possible
1610         !
1611         xor     %i0, %i1, %o4           ! xor from and to address
1612         btst    7, %o4                  ! if lower three bits zero
1613         bz      %icc, .aldoubcp         ! can align on double boundary
1614         .empty  ! assembler complains about label
1615 
1616         xor     %i0, %i1, %o4           ! delay slot of the bz above; recomputes the same %o4
1617         btst    3, %o4                  ! if lower two bits zero
1618         bz      %icc, .alwordcp         ! can align on word boundary
1619         btst    3, %i0                  ! delay slot, from address unaligned?
1620         !
1621         ! use aligned reads and writes where possible
1622         ! this differs from wordcp in that it copes
1623         ! with odd alignment between source and destination
1624         ! using word reads and writes with the proper shifts
1625         ! in between to align transfers to and from memory
1626         ! i0 - src address, i1 - dest address, i2 - count
1627         ! i3, i4 - tmps used for generating complete word
1628         ! i5 (word to write)
1629         ! l0 size in bits of upper part of source word (US)
1630         ! l1 size in bits of lower part of source word (LS = 32 - US)
1631         ! l2 size in bits of upper part of destination word (UD)
1632         ! l3 size in bits of lower part of destination word (LD = 32 - UD)
1633         ! l4 number of bytes leftover after aligned transfers complete
1634         ! l5 the number 32
1635         !
1636         mov     32, %l5                 ! load an oft-needed constant
1637         bz      .align_dst_only         ! flags still from btst 3, %i0: source is aligned
1638         btst    3, %i1                  ! is destination address aligned?
1639         clr     %i4                     ! clear registers used in either case
1640         bz      %icc, .align_src_only   ! flags from btst 3, %i1: destination is aligned
1641         clr     %l0
1642         !
1643         ! both source and destination addresses are unaligned
1644         !
1645 1:                                      ! align source
1646         ldub    [%i0], %i3              ! read a byte from source address
1647         add     %i0, 1, %i0             ! increment source address
1648         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
1649         btst    3, %i0                  ! is source aligned?
1650         add     %l0, 8, %l0             ! increment size of upper source (US)
1651         bnz,a   1b
1652         sll     %i4, 8, %i4             ! make room for next byte
1653 
1654         sub     %l5, %l0, %l1           ! generate shift left count (LS)
1655         sll     %i4, %l1, %i4           ! prepare to get rest
1656         ld      [%i0], %i3              ! read a word
1657         add     %i0, 4, %i0             ! increment source address
1658         srl     %i3, %l0, %i5           ! upper src bits into lower dst bits
1659         or      %i4, %i5, %i5           ! merge
1660         mov     24, %l3                 ! align destination
1661 1:
1662         srl     %i5, %l3, %i4           ! prepare to write a single byte
1663         stb     %i4, [%i1]              ! write a byte
1664         add     %i1, 1, %i1             ! increment destination address
1665         sub     %i2, 1, %i2             ! decrement count
1666         btst    3, %i1                  ! is destination aligned?
1667         bnz,a   1b
1668         sub     %l3, 8, %l3             ! delay slot, decrement shift count (LD)
1669         sub     %l5, %l3, %l2           ! generate shift left count (UD)
1670         sll     %i5, %l2, %i5           ! move leftover into upper bytes
1671         cmp     %l2, %l0                ! cmp # reqd to fill dst w old src left
1672         bgu     %ncc, .more_needed      ! need more to fill than we have
1673         nop
1674 
1675         sll     %i3, %l1, %i3           ! clear upper used byte(s)
1676         srl     %i3, %l1, %i3
1677         ! get the odd bytes between alignments
1678         sub     %l0, %l2, %l0           ! regenerate shift count
1679         sub     %l5, %l0, %l1           ! generate new shift left count (LS)
1680         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
1681         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
1682         srl     %i3, %l0, %i4
1683         or      %i5, %i4, %i5
1684         st      %i5, [%i1]              ! write a word
1685         subcc   %i2, 4, %i2             ! decrement count
1686         bz      %ncc, .unalign_out
1687         add     %i1, 4, %i1             ! increment destination address
1688 
1689         b       2f                      ! join the word loop in .xfer
1690         sll     %i3, %l1, %i5           ! get leftover into upper bits
1691 .more_needed:
1692         sll     %i3, %l0, %i3           ! save remaining byte(s)
1693         srl     %i3, %l0, %i3
1694         sub     %l2, %l0, %l1           ! regenerate shift count
1695         sub     %l5, %l1, %l0           ! generate new shift left count
1696         sll     %i3, %l1, %i4           ! move to fill empty space
1697         b       3f                      ! join .xfer at the count split
1698         or      %i5, %i4, %i5           ! merge to complete word
1699         !
1700         ! the source address is aligned and destination is not
1701         !
1702 .align_dst_only:
1703         ld      [%i0], %i4              ! read a word
1704         add     %i0, 4, %i0             ! increment source address
1705         mov     24, %l0                 ! initial shift alignment count
1706 1:
1707         srl     %i4, %l0, %i3           ! prepare to write a single byte
1708         stb     %i3, [%i1]              ! write a byte
1709         add     %i1, 1, %i1             ! increment destination address
1710         sub     %i2, 1, %i2             ! decrement count
1711         btst    3, %i1                  ! is destination aligned?
1712         bnz,a   1b
1713         sub     %l0, 8, %l0             ! delay slot, decrement shift count
1714 .xfer:
1715         sub     %l5, %l0, %l1           ! generate shift left count
1716         sll     %i4, %l1, %i5           ! get leftover
1717 3:
1718         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
1719         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
1720 2:
1721         ld      [%i0], %i3              ! read a source word
1722         add     %i0, 4, %i0             ! increment source address
1723         srl     %i3, %l0, %i4           ! upper src bits into lower dst bits
1724         or      %i5, %i4, %i5           ! merge with upper dest bits (leftover)
1725         st      %i5, [%i1]              ! write a destination word
1726         subcc   %i2, 4, %i2             ! decrement count
1727         bz      %ncc, .unalign_out      ! check if done
1728         add     %i1, 4, %i1             ! increment destination address
1729         b       2b                      ! loop
1730         sll     %i3, %l1, %i5           ! get leftover
1731 .unalign_out:                       ! word loop done: write out up to 3 leftover bytes (%l4)
1732         tst     %l4                     ! any bytes leftover?
1733         bz      %ncc, .cpdone
1734         .empty                          ! allow next instruction in delay slot
1735 1:
1736         sub     %l0, 8, %l0             ! decrement shift
1737         srl     %i3, %l0, %i4           ! upper src byte into lower dst byte
1738         stb     %i4, [%i1]              ! write a byte
1739         subcc   %l4, 1, %l4             ! decrement count
1740         bz      %ncc, .cpdone           ! done?
1741         add     %i1, 1, %i1             ! increment destination
1742         tst     %l0                     ! any more previously read bytes
1743         bnz     %ncc, 1b                ! we have leftover bytes
1744         mov     %l4, %i2                ! delay slot, mv cnt where dbytecp wants
1745         b       .dbytecp                ! let dbytecp do the rest
1746         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
1747         !
1748         ! the destination address is aligned and the source is not
1749         !
1750 .align_src_only:
1751         ldub    [%i0], %i3              ! read a byte from source address
1752         add     %i0, 1, %i0             ! increment source address
1753         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
1754         btst    3, %i0                  ! is source aligned?
1755         add     %l0, 8, %l0             ! increment shift count (US)
1756         bnz,a   .align_src_only
1757         sll     %i4, 8, %i4             ! make room for next byte
1758         b,a     .xfer
1759         !
1760         ! if from address unaligned for double-word moves,
1761         ! move bytes till it is, if count is < 56 it could take
1762         ! longer to align the thing than to do the transfer
1763         ! in word size chunks right away
1764         !
1765 .aldoubcp:
1766         cmp     %i2, 56                 ! if count < 56, use wordcp, it takes
1767         blu,a   %ncc, .alwordcp         ! longer to align doubles than words
1768         mov     3, %o0                  ! mask for word alignment
1769         call    .alignit                ! copy bytes until aligned
1770         mov     7, %o0                  ! mask for double alignment
1771         !
1772         ! source and destination are now double-word aligned
1773         ! i3 has aligned count returned by alignit
1774         !
1775         and     %i2, 7, %i2             ! unaligned leftover count
1776         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
1777 5:
1778         ldx     [%i0+%i1], %o4          ! read from address
1779         stx     %o4, [%i1]              ! write at destination address
1780         subcc   %i3, 8, %i3             ! dec count
1781         bgu     %ncc, 5b
1782         add     %i1, 8, %i1             ! delay slot, inc to address
1783         cmp     %i2, 4                  ! see if we can copy a word
1784         blu     %ncc, .dbytecp          ! if 3 or less bytes use bytecp
1785         .empty
1786         !
1787         ! for leftover bytes we fall into wordcp, if needed
1788         !
1789 .wordcp:
1790         and     %i2, 3, %i2             ! unaligned leftover count
1791 5:
1792         ld      [%i0+%i1], %o4          ! read from address
1793         st      %o4, [%i1]              ! write at destination address
1794         subcc   %i3, 4, %i3             ! dec count
1795         bgu     %ncc, 5b
1796         add     %i1, 4, %i1             ! delay slot, inc to address
1797         b,a     .dbytecp
1798 
1799         ! we come here to align copies on word boundaries
1800 .alwordcp:
1801         call    .alignit                ! go word-align it
1802         mov     3, %o0                  ! bits that must be zero to be aligned
1803         b       .wordcp
1804         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
1805 
1806         !
1807         ! byte copy, works with any alignment
1808         !
1809 .bytecp:
1810         b       .dbytecp
1811         sub     %i0, %i1, %i0           ! i0 gets difference of src and dst
1812 
1813         !
1814         ! differenced byte copy, works with any alignment
1815         ! assumes dest in %i1 and (source - dest) in %i0
1816         !
1817 1:
1818         stb     %o4, [%i1]              ! write to address
1819         inc     %i1                     ! inc to address
1820 .dbytecp:
1821         deccc   %i2                     ! dec count
1822         bgeu,a  %ncc, 1b                ! loop till done
1823         ldub    [%i0+%i1], %o4          ! read from address
1824         !
1825         ! FPUSED_FLAG will not have been set in any path leading to
1826         ! this point. No need to deal with it.
1827         !
1828 .cpdone:
1829         btst    BCOPY_FLAG, %l6
1830         bz,pn   %icc, 2f
1831         andncc  %l6, BCOPY_FLAG, %l6
1832         !
1833         ! Here via bcopy. Check to see if the handler was NULL.
1834         ! If so, just return quietly. Otherwise, reset the
1835         ! handler and go home.
1836         !
1837         bnz,pn  %ncc, 2f
1838         nop
1839         !
1840         ! Null handler.
1841         !
1842         ret
1843         restore %g0, 0, %o0
1844         !
1845         ! Here via kcopy or bcopy with a handler. Reset the
1846         ! fault handler.
1847         !
1848 2:
1849         membar  #Sync
1850         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1851         ret
1852         restore %g0, 0, %o0             ! return (0)
1853 
1854 /*
1855  * Common code used to align transfers on word and doubleword
1856  * boundaries.  Aligns source and destination and returns a count
1857  * of aligned bytes to transfer in %i3
1858  */
1859 1:
1860         inc     %i0                     ! inc from
1861         stb     %o4, [%i1]              ! write a byte
1862         inc     %i1                     ! inc to
1863         dec     %i2                     ! dec count
1864 .alignit:
1865         btst    %o0, %i0                ! %o0 is bit mask to check for alignment
1866         bnz,a   1b
1867         ldub    [%i0], %o4              ! read next byte
1868 
1869         retl
1870         andn    %i2, %o0, %i3           ! return size of aligned bytes
1871         SET_SIZE(bcopy)
1872 
1873 #endif  /* lint */
1874 
1875 /*
1876  * Block copy with possibly overlapped operands.
1877  */
1878 
1879 #if defined(lint)
1880 
1881 /*ARGSUSED*/
1882 void
1883 ovbcopy(const void *from, void *to, size_t count)
1884 {}
1885 
1886 #else   /* lint */
1887 
1888         ENTRY(ovbcopy)
1889         tst     %o2                     ! check count
1890         bgu,a   %ncc, 1f                ! count != 0: go pick a copy strategy
1891         subcc   %o0, %o1, %o3           ! %o3 = from - to, sets cc (annulled if count == 0)
1892 
1893         retl                            ! count == 0: nothing to do, return
1894         nop
1895 1:
1896         bneg,a  %ncc, 2f                ! from - to negative? (cc from the subcc above)
1897         neg     %o3                     ! if < 0, make it positive (annulled otherwise)
1898 2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
1899         bleu    %ncc, bcopy             ! if size <= abs(diff): tail-jump to bcopy,
1900         .empty                          !   no overlap
1901         cmp     %o0, %o1                ! compare from and to (also sits in the bleu delay slot)
1902         blu     %ncc, .ov_bkwd          ! if from < to, copy backwards
1903         nop
1904         !
1905         ! Copy forwards, one byte at a time (from > to and the regions overlap).
1906         !
1907 .ov_fwd:
1908         ldub    [%o0], %o3              ! read from address
1909         inc     %o0                     ! inc from address
1910         stb     %o3, [%o1]              ! write to address
1911         deccc   %o2                     ! dec count
1912         bgu     %ncc, .ov_fwd           ! loop while count > 0
1913         inc     %o1                     ! delay slot, inc to address
1914 
1915         retl                            ! return
1916         nop
1917         !
1918         ! Copy backwards, one byte at a time, using count as the index.
1919         !
1920 .ov_bkwd:
1921         deccc   %o2                     ! dec count; %o2 is now the byte index
1922         ldub    [%o0 + %o2], %o3        ! get byte at end of src
1923         bgu     %ncc, .ov_bkwd          ! loop while index > 0 (cc from deccc)
1924         stb     %o3, [%o1 + %o2]        ! delay slot, store at end of dst (runs for index 0 too)
1925 
1926         retl                            ! return
1927         nop
1928         SET_SIZE(ovbcopy)
1929 
1930 #endif  /* lint */
1931 
1932 /*
1933  * hwblkpagecopy()
1934  *
1935  * Copies exactly one page.  This routine assumes the caller (ppcopy)
1936  * has already disabled kernel preemption and has checked
1937  * use_hw_bcopy.
1938  */
1939 #ifdef lint
1940 /*ARGSUSED*/
1941 void
1942 hwblkpagecopy(const void *src, void *dst)
1943 { }
1944 #else /* lint */
1945         ENTRY(hwblkpagecopy)
1946         ! new window; 4*64 bytes leave room for three 64-byte-aligned fpreg blocks
1947         save    %sp, -SA(MINFRAME + 4*64), %sp
1948 
1949         ! %i0 - source address (arg), advanced 64 bytes per block loaded
1950         ! %i1 - destination address (arg), advanced 64 bytes per block stored
1951         ! %i2 - length of region (not arg), counts down from PAGESIZE - 64
1952         ! %l0 - saved fprs
1953         ! %l1 - pointer to saved fpregs (%l3 is a scratch cursor into that area)
1954 
1955         rd      %fprs, %l0              ! %l0 = %fprs on entry; check for unused fp
1956         btst    FPRS_FEF, %l0           ! FEF set on entry?
1957         bz      1f                      ! no: fpregs need not be preserved
1958         membar  #Sync                   ! delay slot, executed on both paths
1959 
1960         ! save in-use fpregs on stack (three blocks: %d0.., %d16.., %d32..)
1961         add     %fp, STACK_BIAS - 193, %l1      ! 193 = 3*64 + 1 below the biased %fp
1962         and     %l1, -64, %l1                   ! round down to a 64-byte boundary
1963         stda    %d0, [%l1]ASI_BLK_P
1964         add     %l1, 64, %l3
1965         stda    %d16, [%l3]ASI_BLK_P
1966         add     %l3, 64, %l3
1967         stda    %d32, [%l3]ASI_BLK_P
1968         membar  #Sync
1969 
1970 1:      wr      %g0, FPRS_FEF, %fprs    ! set %fprs to FPRS_FEF for the copy
1971         ldda    [%i0]ASI_BLK_P, %d0     ! prime the pipeline: first source block
1972         add     %i0, 64, %i0
1973         set     PAGESIZE - 64, %i2      ! bytes still to load after the primed block
1974 
1975 2:      ldda    [%i0]ASI_BLK_P, %d16    ! first half: load next block into %d16..
1976         fsrc1   %d0, %d32               ! stage previous block %d0-%d14 in %d32-%d46
1977         fsrc1   %d2, %d34
1978         fsrc1   %d4, %d36
1979         fsrc1   %d6, %d38
1980         fsrc1   %d8, %d40
1981         fsrc1   %d10, %d42
1982         fsrc1   %d12, %d44
1983         fsrc1   %d14, %d46
1984         stda    %d32, [%i1]ASI_BLK_P    ! store the staged block
1985         add     %i0, 64, %i0
1986         subcc   %i2, 64, %i2
1987         bz,pn   %ncc, 3f                ! the only loop exit: last block is now in %d16..
1988         add     %i1, 64, %i1            ! delay slot, advance destination
1989         ldda    [%i0]ASI_BLK_P, %d0     ! second half: load next block into %d0..
1990         fsrc1   %d16, %d32              ! stage previous block %d16-%d30 in %d32-%d46
1991         fsrc1   %d18, %d34
1992         fsrc1   %d20, %d36
1993         fsrc1   %d22, %d38
1994         fsrc1   %d24, %d40
1995         fsrc1   %d26, %d42
1996         fsrc1   %d28, %d44
1997         fsrc1   %d30, %d46
1998         stda    %d32, [%i1]ASI_BLK_P    ! store the staged block
1999         add     %i0, 64, %i0
2000         sub     %i2, 64, %i2            ! no exit test here: assumes (PAGESIZE - 64) / 64 is odd - TODO confirm
2001         ba,pt   %ncc, 2b
2002         add     %i1, 64, %i1            ! delay slot, advance destination
2003 
2004 3:      membar  #Sync
2005         btst    FPRS_FEF, %l0           ! was FEF set on entry?
2006         bz      4f                      ! no: nothing to reload
2007         stda    %d16, [%i1]ASI_BLK_P    ! delay slot (always runs): store the final block
2008 
2009         ! restore fpregs from stack (same three blocks saved above)
2010         membar  #Sync
2011         ldda    [%l1]ASI_BLK_P, %d0
2012         add     %l1, 64, %l3
2013         ldda    [%l3]ASI_BLK_P, %d16
2014         add     %l3, 64, %l3
2015         ldda    [%l3]ASI_BLK_P, %d32
2016 
2017 4:      wr      %l0, 0, %fprs           ! restore fprs
2018         membar #Sync
2019         ret
2020         restore %g0, 0, %o0             ! delay slot: pop the window, %o0 = 0
2021         SET_SIZE(hwblkpagecopy)
2022 #endif  /* lint */
2023 
2024 
2025 /*
2026  * Transfer data to and from user space -
2027  * Note that these routines can cause faults
2028  * It is assumed that the kernel has nothing at
2029  * less than KERNELBASE in the virtual address space.
2030  *
2031  * Note that copyin(9F) and copyout(9F) are part of the
2032  * DDI/DKI which specifies that they return '-1' on "errors."
2033  *
2034  * Sigh.
2035  *
2036  * So there's two extremely similar routines - xcopyin() and xcopyout()
2037  * which return the errno that we've faithfully computed.  This
2038  * allows other callers (e.g. uiomove(9F)) to work correctly.
2039  * Given that these are used pretty heavily, we expand the calling
2040  * sequences inline for all flavours (rather than making wrappers).
2041  *
2042  * There are also stub routines for xcopyout_little and xcopyin_little,
2043  * which currently are intended to handle requests of <= 16 bytes from
2044  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2045  * is left as an exercise...
2046  */
2047 
2048 /*
2049  * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2050  *
2051  * General theory of operation:
2052  *
2053  * The only difference between default_copy{in,out} and
2054  * default_xcopy{in,out} is in the error handling routine they invoke
2055  * when a memory access error is seen. default_xcopyOP returns the errno
2056  * while default_copyOP returns -1 (see above). copy{in,out}_noerr set
2057  * a special flag (by oring the value 2 into the fault handler address)
2058  * if they are called with a fault handler already in place. That flag
2059  * causes the default handlers to trampoline to the previous handler
2060  * upon an error.
2061  *
2062  * None of the copyops routines grab a window until it's decided that
2063  * we need to do a HW block copy operation. This saves a window
2064  * spill/fill when we're called during socket ops. The typical IO
2065  * path won't cause spill/fill traps.
2066  *
2067  * This code uses a set of 4 limits for the maximum size that will
2068  * be copied given a particular input/output address alignment.
2069  * the default limits are:
2070  *
2071  * single byte aligned - 900 (hw_copy_limit_1)
2072  * two byte aligned - 1800 (hw_copy_limit_2)
2073  * four byte aligned - 3600 (hw_copy_limit_4)
2074  * eight byte aligned - 7200 (hw_copy_limit_8)
2075  *
2076  * If the value for a particular limit is zero, the copy will be done
2077  * via the copy loops rather than VIS.
2078  *
2079  * Flow:
2080  *
2081  * If count == zero return zero.
2082  *
2083  * Store the previous lo_fault handler into %g6.
2084  * Place our secondary lofault handler into %g5.
2085  * Place the address of our nowindow fault handler into %o3.
2086  * Place the address of the windowed fault handler into %o4.
2087  * --> We'll use this handler if we end up grabbing a window
2088  * --> before we use VIS instructions.
2089  *
2090  * If count is less than or equal to SMALL_LIMIT (7) we
2091  * always do a byte for byte copy.
2092  *
2093  * If count is > SMALL_LIMIT, we check the alignment of the input
2094  * and output pointers. Based on the alignment we check count
2095  * against a soft limit of VIS_COPY_THRESHOLD (900 on spitfire). If
2096  * we're larger than VIS_COPY_THRESHOLD, we check against a limit based
2097  * on detected alignment. If we exceed the alignment value we copy
2098  * via VIS instructions.
2099  *
2100  * If we don't exceed one of the limits, we store -count in %o3,
2101  * we store the number of chunks (8, 4, 2 or 1 byte) operated
2102  * on in our basic copy loop in %o2. Following this we branch 
2103  * to the appropriate copy loop and copy that many chunks.
2104  * Since we've been adding the chunk size to %o3 each time through
2105  * as well as decrementing %o2, we can tell if any data is
2106  * left to be copied by examining %o3. If that is zero, we're
2107  * done and can go home. If not, we figure out what the largest
2108  * chunk size left to be copied is and branch to that copy loop
2109  * unless there's only one byte left. We load that as we're
2110  * branching to code that stores it just before we return.
2111  *
2112  * There is one potential situation in which we start to do a VIS
2113  * copy but decide to punt and return to the copy loops. There is
2114  * (in the default configuration) a window of 256 bytes between
2115  * the single byte aligned copy limit and what VIS treats as its
2116  * minimum if floating point is in use in the calling app. We need
2117  * to be prepared to handle this. See the .small_copyOP label for
2118  * details.
2119  *
2120  * Fault handlers are invoked if we reference memory that has no
2121  * current mapping.  All forms share the same copyio_fault handler.
2122  * This routine handles fixing up the stack and general housecleaning.
2123  * Each copy operation has a simple fault handler that is then called
2124  * to do the work specific to the individual operation.  The handlers
2125  * for default_copyOP and copyOP_noerr are found at the end of
2126  * default_copyout. The handlers for default_xcopyOP are found at the
2127  * end of xdefault_copyin.
2128  */
2129 
2130 /*
2131  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2132  */
2133 
2134 #if defined(lint)
2135 
2136 /*ARGSUSED*/
2137 int
2138 copyout(const void *kaddr, void *uaddr, size_t count)
2139 { return (0); }         /* lint-only stub; the non-lint copyout is the assembly routine */
2140 
2141 #else   /* lint */
2142 
2143 /*
2144  * We save the arguments in the following registers in case of a fault:
2145  *      kaddr - %g2
2146  *      uaddr - %g3
2147  *      count - %g4
2148  */
2149 #define SAVE_SRC        %g2
2150 #define SAVE_DST        %g3
2151 #define SAVE_COUNT      %g4
2152 
2153 #define REAL_LOFAULT            %g5
2154 #define SAVED_LOFAULT           %g6
2155 
2156 /*
2157  * Generic copyio fault handler.  This is the first line of defense when a 
2158  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
2159  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2160  * This allows us to share common code for all the flavors of the copy
2161  * operations, including the _noerr versions.
2162  *
2163  * Note that this function will restore the original input parameters before
2164  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
2165  * member of the t_copyop structure, if needed.
2166  */
2167         ENTRY(copyio_fault)
2168         btst    FPUSED_FLAG, SAVED_LOFAULT      ! was FPUSED_FLAG set by the block copy?
2169         bz      1f                              ! no: no fp state to put back
2170           andn  SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT       ! delay slot: strip the flag either way
2171 
2172         membar  #Sync
2173 
2174         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! gsr value saved in the frame
2175         wr      %o2, 0, %gsr            ! restore gsr
2176 
2177         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3     ! fprs value saved in the frame
2178         btst    FPRS_FEF, %o3           ! was FEF set in the saved fprs?
2179         bz      4f                      ! no: fpregs were not saved, just zero them
2180           nop
2181 
2182         ! restore fpregs from stack (four blocks: %d0.., %d16.., %d32.., %d48..)
2183         membar  #Sync
2184         add     %fp, STACK_BIAS - 257, %o2      ! 257 = 4*64 + 1 below the biased %fp
2185         and     %o2, -64, %o2                   ! round down to a 64-byte boundary
2186         ldda    [%o2]ASI_BLK_P, %d0
2187         add     %o2, 64, %o2
2188         ldda    [%o2]ASI_BLK_P, %d16
2189         add     %o2, 64, %o2
2190         ldda    [%o2]ASI_BLK_P, %d32
2191         add     %o2, 64, %o2
2192         ldda    [%o2]ASI_BLK_P, %d48
2193         membar  #Sync
2194 
2195         ba,pt   %ncc, 1f
2196           wr    %o3, 0, %fprs           ! restore fprs
2197 
2198 4:
2199         FZERO                           ! zero all of the fpregs
2200         wr      %o3, 0, %fprs           ! restore fprs
2201 
2202 1:
2203         ! NOTE(review): t_lofault is not rewritten on this path (copyio_fault_nowindow does) - confirm REAL_LOFAULT covers it
2204         restore                         ! pop the window grabbed before the block copy
2205 
2206         mov     SAVE_SRC, %o0           ! hand the original arguments back in %o0-%o2
2207         mov     SAVE_DST, %o1
2208         jmp     REAL_LOFAULT            ! continue in the operation-specific handler
2209           mov   SAVE_COUNT, %o2         ! delay slot
2210         SET_SIZE(copyio_fault)
2211 
2212         ENTRY(copyio_fault_nowindow)    ! fault handler for the leaf-mode (no save done) copy loops
2213         membar  #Sync
2214         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2215 
2216         mov     SAVE_SRC, %o0           ! hand the original arguments back in %o0-%o2
2217         mov     SAVE_DST, %o1
2218         jmp     REAL_LOFAULT            ! continue in the operation-specific handler
2219           mov   SAVE_COUNT, %o2         ! delay slot
2220         SET_SIZE(copyio_fault_nowindow)
2221 
2222         ENTRY(copyout)
2223         sethi   %hi(.copyout_err), REAL_LOFAULT
2224         or      REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2225 
2226 .do_copyout:
2227         !
2228         ! Check the length and bail if zero.
2229         !
2230         tst     %o2
2231         bnz,pt  %ncc, 1f
2232           nop
2233         retl
2234           clr   %o0
2235 1:
2236         sethi   %hi(copyio_fault), %o4
2237         or      %o4, %lo(copyio_fault), %o4
2238         sethi   %hi(copyio_fault_nowindow), %o3
2239         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2240         or      %o3, %lo(copyio_fault_nowindow), %o3
2241         membar  #Sync
2242         stn     %o3, [THREAD_REG + T_LOFAULT]
2243 
2244         mov     %o0, SAVE_SRC
2245         mov     %o1, SAVE_DST
2246         mov     %o2, SAVE_COUNT
2247 
2248         !
2249         ! Check to see if we're more than SMALL_LIMIT (7 bytes).
2250         ! Run in leaf mode, using the %o regs as our input regs.
2251         !
2252         subcc   %o2, SMALL_LIMIT, %o3
2253         bgu,a,pt %ncc, .dco_ns
2254         or      %o0, %o1, %o3
2255         !
2256         ! What was previously ".small_copyout"
2257         ! Do full differenced copy.
2258         !
2259 .dcobcp:
2260         sub     %g0, %o2, %o3           ! negate count
2261         add     %o0, %o2, %o0           ! make %o0 point at the end
2262         add     %o1, %o2, %o1           ! make %o1 point at the end
2263         ba,pt   %ncc, .dcocl
2264         ldub    [%o0 + %o3], %o4        ! load first byte
2265         !
2266         ! %o0 and %o1 point at the end and remain pointing at the end
2267         ! of their buffers. We pull things out by adding %o3 (which is
2268         ! the negation of the length) to the buffer end which gives us
2269         ! the current location in the buffers. By incrementing %o3 we walk
2270         ! through both buffers without having to bump each buffer's
2271         ! pointer. A very fast 4 instruction loop.
2272         !
2273         .align 16
2274 .dcocl:
2275         stba    %o4, [%o1 + %o3]ASI_USER
2276         inccc   %o3
2277         bl,a,pt %ncc, .dcocl
2278         ldub    [%o0 + %o3], %o4
2279         !
2280         ! We're done. Go home.
2281         !
2282         membar  #Sync
2283         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
2284         retl
2285         clr     %o0
2286         !
2287         ! Try aligned copies from here.
2288         !
2289 .dco_ns:
2290         ! %o0 = kernel addr (to be copied from)
2291         ! %o1 = user addr (to be copied to)
2292         ! %o2 = length
2293         ! %o3 = %o0 | %o1 (used for alignment checking)
2294         ! %o4 = address of copyio_fault (the windowed fault handler)
2295         ! SAVED_LOFAULT = original t_lofault
2296         !
2297         ! See if we're single byte aligned. If we are, check the
2298         ! limit for single byte copies. If we're smaller or equal,
2299         ! bounce to the byte for byte copy loop. Otherwise do it in
2300         ! HW (if enabled).
2301         !
2302         btst    1, %o3
2303         bz,pt   %icc, .dcoh8
2304         btst    7, %o3
2305         !
2306         ! Single byte aligned. Do we do it via HW or via
2307         ! byte for byte? Do a quick no memory reference
2308         ! check to pick up small copies.
2309         !
2310         subcc   %o2, VIS_COPY_THRESHOLD, %o3
2311         bleu,pt %ncc, .dcobcp
2312         sethi   %hi(hw_copy_limit_1), %o3
2313         !
2314         ! Big enough that we need to check the HW limit for
2315         ! this size copy.
2316         !
2317         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
2318         !
2319         ! Is HW copy on? If not, do everything byte for byte.
2320         !
2321         tst     %o3
2322         bz,pn   %icc, .dcobcp
2323         subcc   %o3, %o2, %o3
2324         !
2325         ! If we're less than or equal to the single byte copy limit,
2326         ! bop to the copy loop.
2327         !
2328         bge,pt  %ncc, .dcobcp
2329         nop
2330         !
2331         ! We're big enough and copy is on. Do it with HW.
2332         !
2333         ba,pt   %ncc, .big_copyout
2334         nop
2335 .dcoh8:
2336         !
2337         ! 8 byte aligned?
2338         !
2339         bnz,a   %ncc, .dcoh4
2340         btst    3, %o3
2341         !
2342         ! See if we're in the "small range".
2343         ! If so, go off and do the copy.
2344         ! If not, load the hard limit. %o3 is
2345         ! available for reuse.
2346         !
2347         subcc   %o2, VIS_COPY_THRESHOLD, %o3
2348         bleu,pt %ncc, .dcos8
2349         sethi   %hi(hw_copy_limit_8), %o3
2350         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
2351         !
2352         ! If it's zero, there's no HW bcopy.
2353         ! Bop off to the aligned copy.
2354         !
2355         tst     %o3
2356         bz,pn   %icc, .dcos8
2357         subcc   %o3, %o2, %o3
2358         !
2359         ! We're negative if our size is larger than hw_copy_limit_8.
2360         !
2361         bge,pt  %ncc, .dcos8
2362         nop
2363         !
2364         ! HW assist is on and we're large enough. Do it.
2365         !
2366         ba,pt   %ncc, .big_copyout
2367         nop
2368 .dcos8:
2369         !
2370         ! Housekeeping for copy loops. Uses same idea as in the byte for
2371         ! byte copy loop above.
2372         !
2373         add     %o0, %o2, %o0
2374         add     %o1, %o2, %o1
2375         sub     %g0, %o2, %o3
2376         ba,pt   %ncc, .dodebc
2377         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
2378         !
2379         ! 4 byte aligned?
2380         !
2381 .dcoh4:
2382         bnz,pn  %ncc, .dcoh2
2383         !
2384         ! See if we're in the "small range".
2385         ! If so, go off and do the copy.
2386         ! If not, load the hard limit. %o3 is
2387         ! available for reuse.
2388         !
2389         subcc   %o2, VIS_COPY_THRESHOLD, %o3
2390         bleu,pt %ncc, .dcos4
2391         sethi   %hi(hw_copy_limit_4), %o3
2392         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
2393         !
2394         ! If it's zero, there's no HW bcopy.
2395         ! Bop off to the aligned copy.
2396         !
2397         tst     %o3
2398         bz,pn   %icc, .dcos4
2399         subcc   %o3, %o2, %o3
2400         !
2401         ! We're negative if our size is larger than hw_copy_limit_4.
2402         !
2403         bge,pt  %ncc, .dcos4
2404         nop
2405         !
2406         ! HW assist is on and we're large enough. Do it.
2407         !
2408         ba,pt   %ncc, .big_copyout
2409         nop
2410 .dcos4:
2411         add     %o0, %o2, %o0
2412         add     %o1, %o2, %o1
2413         sub     %g0, %o2, %o3
2414         ba,pt   %ncc, .dodfbc
2415         srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
2416         !
2417         ! We must be 2 byte aligned. Off we go.
2418         ! The check for small copies was done in the
2419         ! delay at .dcoh4
2420         !
2421 .dcoh2:
2422         ble     %ncc, .dcos2
2423         sethi   %hi(hw_copy_limit_2), %o3
2424         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
2425         tst     %o3
2426         bz,pn   %icc, .dcos2
2427         subcc   %o3, %o2, %o3
2428         bge,pt  %ncc, .dcos2
2429         nop
2430         !
2431         ! HW is on and we're big enough. Do it.
2432         !
2433         ba,pt   %ncc, .big_copyout
2434         nop
2435 .dcos2:
2436         add     %o0, %o2, %o0
2437         add     %o1, %o2, %o1
2438         sub     %g0, %o2, %o3
2439         ba,pt   %ncc, .dodtbc
2440         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
2441 .small_copyout:
2442         !
2443         ! Why are we doing this AGAIN? There are certain conditions in
2444         ! big_copyout that will cause us to forego the HW assisted copies
2445         ! and bounce back to a non-HW assisted copy. This dispatches those
2446         ! copies. Note that we branch around this in the main line code.
2447         !
2448         ! We make no check for limits or HW enablement here. We've
2449         ! already been told that we're a poster child so just go off
2450         ! and do it.
2451         !
2452         or      %o0, %o1, %o3
2453         btst    1, %o3
2454         bnz     %icc, .dcobcp           ! Most likely
2455         btst    7, %o3
2456         bz      %icc, .dcos8
2457         btst    3, %o3
2458         bz      %icc, .dcos4
2459         nop
2460         ba,pt   %ncc, .dcos2
2461         nop
2462         .align 32
2463 .dodebc:
2464         ldx     [%o0 + %o3], %o4
2465         deccc   %o2
2466         stxa    %o4, [%o1 + %o3]ASI_USER
2467         bg,pt   %ncc, .dodebc
2468         addcc   %o3, 8, %o3
2469         !
2470         ! End of copy loop. Check to see if we're done. Most
2471         ! eight byte aligned copies end here.
2472         !
2473         bz,pt   %ncc, .dcofh
2474         nop
2475         !
2476         ! Something is left - do it byte for byte.
2477         ! 
2478         ba,pt   %ncc, .dcocl
2479         ldub    [%o0 + %o3], %o4        ! load next byte
2480         !
2481         ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
2482         !
2483         .align 32
2484 .dodfbc:
2485         lduw    [%o0 + %o3], %o4
2486         deccc   %o2
2487         sta     %o4, [%o1 + %o3]ASI_USER
2488         bg,pt   %ncc, .dodfbc
2489         addcc   %o3, 4, %o3
2490         !
2491         ! End of copy loop. Check to see if we're done. Most
2492         ! four byte aligned copies end here.
2493         !
2494         bz,pt   %ncc, .dcofh
2495         nop
2496         !
2497         ! Something is left. Do it byte for byte.
2498         !
2499         ba,pt   %ncc, .dcocl
2500         ldub    [%o0 + %o3], %o4        ! load next byte
2501         !
2502         ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
2503         ! copy.
2504         !
2505         .align 32
2506 .dodtbc:
2507         lduh    [%o0 + %o3], %o4
2508         deccc   %o2
2509         stha    %o4, [%o1 + %o3]ASI_USER
2510         bg,pt   %ncc, .dodtbc
2511         addcc   %o3, 2, %o3
2512         !
2513         ! End of copy loop. Anything left?
2514         !
2515         bz,pt   %ncc, .dcofh
2516         nop
2517         !
2518         ! Deal with the last byte
2519         !
2520         ldub    [%o0 + %o3], %o4
2521         stba    %o4, [%o1 + %o3]ASI_USER
2522 .dcofh:
2523         membar  #Sync
2524         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2525         retl
2526         clr     %o0
2527 
2528 .big_copyout:
2529         !
2530         ! Are we using the FP registers?
2531         !
2532         rd      %fprs, %o3                      ! check for unused fp
2533         btst    FPRS_FEF, %o3
2534         bnz     %icc, .copyout_fpregs_inuse
2535         nop
2536         !
2537         ! We're going to go off and do a block copy.
2538         ! Switch fault handlers and grab a window. We
2539         ! don't do a membar #Sync since we've done only
2540         ! kernel data to this point.
2541         !
2542         stn     %o4, [THREAD_REG + T_LOFAULT]
2543         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2544         !
2545         ! %o3 is now %i3. Save original %fprs.
2546         !
2547         st      %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
2548         ba,pt   %ncc, .do_block_copyout         ! Not in use. Go off and do it.
2549         wr      %g0, FPRS_FEF, %fprs            ! clear %fprs
2550         !
2551 .copyout_fpregs_inuse:
2552         !
2553         ! We're here if the FP regs are in use. Need to see if the request
2554         ! exceeds our suddenly larger minimum. No window has been grabbed on
2555         ! this path yet, so the count is still in %o2 (not %i2); compare unsigned.
2556         cmp     %o2, VIS_COPY_THRESHOLD+(64*4) ! count below the larger minimum?
2557         blu     %ncc, .small_copyout    ! yes: fall back to the non-HW copy loops
2558           nop
2559         !
2560         ! We're going to go off and do a block copy.
2561         ! Change to the heavy duty fault handler and grab a window first.
2562         !
2563         stn     %o4, [THREAD_REG + T_LOFAULT]   ! %o4 = copyio_fault (set up at .do_copyout)
2564         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2565         st      %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]     ! %o3 is now %i3: original %fprs
2566         !
2567         ! save in-use fpregs on stack (four blocks: %d0.., %d16.., %d32.., %d48..)
2568         !
2569         wr      %g0, FPRS_FEF, %fprs
2570         membar  #Sync
2571         add     %fp, STACK_BIAS - 257, %o2      ! new window: %o2 is scratch here
2572         and     %o2, -64, %o2                   ! round down to a 64-byte boundary
2573         stda    %d0, [%o2]ASI_BLK_P
2574         add     %o2, 64, %o2
2575         stda    %d16, [%o2]ASI_BLK_P
2576         add     %o2, 64, %o2
2577         stda    %d32, [%o2]ASI_BLK_P
2578         add     %o2, 64, %o2
2579         stda    %d48, [%o2]ASI_BLK_P
2580         membar  #Sync
2581 
2582 .do_block_copyout:
2583         membar  #StoreStore|#StoreLoad|#LoadStore
2584 
2585         rd      %gsr, %o2
2586         st      %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
2587 
2588         ! Set the lower bit in the saved t_lofault to indicate
2589         ! that we need to clear the %fprs register on the way
2590         ! out
2591         or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 
2592 
2593         ! Swap src/dst since the code below is memcpy code
2594         ! and memcpy/bcopy have different calling sequences
2595         mov     %i1, %i5
2596         mov     %i0, %i1
2597         mov     %i5, %i0
2598 
2599 !!! This code is nearly identical to the version in the sun4u
2600 !!! libc_psr.  Most bugfixes made to that file should be
2601 !!! merged into this routine.
2602 
2603         andcc   %i0, 7, %o3
2604         bz      %ncc, copyout_blkcpy
2605         sub     %o3, 8, %o3
2606         neg     %o3
2607         sub     %i2, %o3, %i2
2608 
2609         ! Align Destination on double-word boundary
2610 
2611 2:      ldub    [%i1], %o4
2612         inc     %i1
2613         stba    %o4, [%i0]ASI_USER
2614         deccc   %o3
2615         bgu     %ncc, 2b
2616           inc   %i0
2617 copyout_blkcpy:
2618         andcc   %i0, 63, %i3
2619         bz,pn   %ncc, copyout_blalign   ! now block aligned
2620         sub     %i3, 64, %i3
2621         neg     %i3                     ! bytes till block aligned
2622         sub     %i2, %i3, %i2           ! update %i2 with new count
2623 
2624         ! Copy %i3 bytes till dst is block (64 byte) aligned. use
2625         ! double word copies.
2626 
2627         alignaddr %i1, %g0, %g1
2628         ldd     [%g1], %d0
2629         add     %g1, 8, %g1
2630 6:
2631         ldd     [%g1], %d2
2632         add     %g1, 8, %g1
2633         subcc   %i3, 8, %i3
2634         faligndata %d0, %d2, %d8
2635         stda     %d8, [%i0]ASI_USER
2636         add     %i1, 8, %i1
2637         bz,pn   %ncc, copyout_blalign
2638         add     %i0, 8, %i0
2639         ldd     [%g1], %d0
2640         add     %g1, 8, %g1
2641         subcc   %i3, 8, %i3
2642         faligndata %d2, %d0, %d8
2643         stda     %d8, [%i0]ASI_USER
2644         add     %i1, 8, %i1
2645         bgu,pn  %ncc, 6b
2646         add     %i0, 8, %i0
2647  
2648 copyout_blalign:
2649         membar  #StoreLoad
2650         ! %i2 = total length
2651         ! %i3 = blocks  (length - 64) / 64
2652         ! %i4 = doubles remaining  (length - blocks)
2653         sub     %i2, 64, %i3
2654         andn    %i3, 63, %i3
2655         sub     %i2, %i3, %i4
2656         andn    %i4, 7, %i4
2657         sub     %i4, 16, %i4
2658         sub     %i2, %i4, %i2
2659         sub     %i2, %i3, %i2
2660 
2661         andn    %i1, 0x3f, %l7          ! blk aligned address
2662         alignaddr %i1, %g0, %g0         ! gen %gsr
2663 
2664         srl     %i1, 3, %l5             ! bits 3,4,5 are now least sig in  %l5
2665         andcc   %l5, 7, %i5             ! mask everything except bits 1,2 3
2666         add     %i1, %i4, %i1
2667         add     %i1, %i3, %i1
2668 
2669         ldda    [%l7]ASI_BLK_P, %d0
2670         add     %l7, 64, %l7
2671         ldda    [%l7]ASI_BLK_P, %d16
2672         add     %l7, 64, %l7
2673         ldda    [%l7]ASI_BLK_P, %d32
2674         add     %l7, 64, %l7
2675         sub     %i3, 128, %i3
2676 
2677         ! switch statement to get us to the right 8 byte blk within a
2678         ! 64 byte block
2679 
2680         cmp      %i5, 4
2681         bgeu,a   copyout_hlf
2682         cmp      %i5, 6
2683         cmp      %i5, 2
2684         bgeu,a   copyout_sqtr
2685         nop
2686         cmp      %i5, 1
2687         be,a     copyout_seg1
2688         nop
2689         ba,pt    %ncc, copyout_seg0
2690         nop
2691 copyout_sqtr:
2692         be,a     copyout_seg2
2693         nop
2694         ba,pt    %ncc, copyout_seg3
2695         nop
2696 
2697 copyout_hlf:
2698         bgeu,a   copyout_fqtr
2699         nop      
2700         cmp      %i5, 5
2701         be,a     copyout_seg5
2702         nop
2703         ba,pt    %ncc, copyout_seg4
2704         nop
2705 copyout_fqtr:
2706         be,a     copyout_seg6
2707         nop
2708         ba,pt    %ncc, copyout_seg7
2709         nop
2710         
2711 copyout_seg0:
2712         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2713         FALIGN_D0
2714         ldda    [%l7]ASI_BLK_P, %d0
2715         stda    %d48, [%i0]ASI_BLK_AIUS
2716         add     %l7, 64, %l7
2717         subcc   %i3, 64, %i3
2718         bz,pn   %ncc, 0f
2719         add     %i0, 64, %i0
2720         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2721         FALIGN_D16
2722         ldda    [%l7]ASI_BLK_P, %d16
2723         stda    %d48, [%i0]ASI_BLK_AIUS
2724         add     %l7, 64, %l7
2725         subcc   %i3, 64, %i3
2726         bz,pn   %ncc, 1f
2727         add     %i0, 64, %i0
2728         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2729         FALIGN_D32
2730         ldda    [%l7]ASI_BLK_P, %d32
2731         stda    %d48, [%i0]ASI_BLK_AIUS
2732         add     %l7, 64, %l7
2733         subcc   %i3, 64, %i3
2734         bz,pn   %ncc, 2f
2735         add     %i0, 64, %i0
2736         ba,a,pt %ncc, copyout_seg0
2737 
2738 0:
2739         FALIGN_D16
2740         stda    %d48, [%i0]ASI_BLK_AIUS
2741         add     %i0, 64, %i0
2742         membar  #Sync
2743         FALIGN_D32
2744         stda    %d48, [%i0]ASI_BLK_AIUS
2745         ba,pt   %ncc, copyout_blkd0
2746         add     %i0, 64, %i0
2747 
2748 1:
2749         FALIGN_D32
2750         stda    %d48, [%i0]ASI_BLK_AIUS
2751         add     %i0, 64, %i0
2752         membar  #Sync
2753         FALIGN_D0
2754         stda    %d48, [%i0]ASI_BLK_AIUS
2755         ba,pt   %ncc, copyout_blkd16
2756         add     %i0, 64, %i0
2757 
2758 2:
2759         FALIGN_D0
2760         stda    %d48, [%i0]ASI_BLK_AIUS
2761         add     %i0, 64, %i0
2762         membar  #Sync
2763         FALIGN_D16
2764         stda    %d48, [%i0]ASI_BLK_AIUS
2765         ba,pt   %ncc, copyout_blkd32
2766         add     %i0, 64, %i0
2767 
2768 copyout_seg1:
2769         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2770         FALIGN_D2
2771         ldda    [%l7]ASI_BLK_P, %d0
2772         stda    %d48, [%i0]ASI_BLK_AIUS
2773         add     %l7, 64, %l7
2774         subcc   %i3, 64, %i3
2775         bz,pn   %ncc, 0f
2776         add     %i0, 64, %i0
2777         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2778         FALIGN_D18
2779         ldda    [%l7]ASI_BLK_P, %d16
2780         stda    %d48, [%i0]ASI_BLK_AIUS
2781         add     %l7, 64, %l7
2782         subcc   %i3, 64, %i3
2783         bz,pn   %ncc, 1f
2784         add     %i0, 64, %i0
2785         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2786         FALIGN_D34
2787         ldda    [%l7]ASI_BLK_P, %d32
2788         stda    %d48, [%i0]ASI_BLK_AIUS
2789         add     %l7, 64, %l7
2790         subcc   %i3, 64, %i3
2791         bz,pn   %ncc, 2f
2792         add     %i0, 64, %i0
2793         ba,a,pt %ncc, copyout_seg1
2794 0:
2795         FALIGN_D18
2796         stda    %d48, [%i0]ASI_BLK_AIUS
2797         add     %i0, 64, %i0
2798         membar  #Sync
2799         FALIGN_D34
2800         stda    %d48, [%i0]ASI_BLK_AIUS
2801         ba,pt   %ncc, copyout_blkd2
2802         add     %i0, 64, %i0
2803 
2804 1:
2805         FALIGN_D34
2806         stda    %d48, [%i0]ASI_BLK_AIUS
2807         add     %i0, 64, %i0
2808         membar  #Sync
2809         FALIGN_D2
2810         stda    %d48, [%i0]ASI_BLK_AIUS
2811         ba,pt   %ncc, copyout_blkd18
2812         add     %i0, 64, %i0
2813 
2814 2:
2815         FALIGN_D2
2816         stda    %d48, [%i0]ASI_BLK_AIUS
2817         add     %i0, 64, %i0
2818         membar  #Sync
2819         FALIGN_D18
2820         stda    %d48, [%i0]ASI_BLK_AIUS
2821         ba,pt   %ncc, copyout_blkd34
2822         add     %i0, 64, %i0
2823 
2824 copyout_seg2:
2825         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2826         FALIGN_D4
2827         ldda    [%l7]ASI_BLK_P, %d0
2828         stda    %d48, [%i0]ASI_BLK_AIUS
2829         add     %l7, 64, %l7
2830         subcc   %i3, 64, %i3
2831         bz,pn   %ncc, 0f
2832         add     %i0, 64, %i0
2833         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2834         FALIGN_D20
2835         ldda    [%l7]ASI_BLK_P, %d16
2836         stda    %d48, [%i0]ASI_BLK_AIUS
2837         add     %l7, 64, %l7
2838         subcc   %i3, 64, %i3
2839         bz,pn   %ncc, 1f
2840         add     %i0, 64, %i0
2841         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2842         FALIGN_D36
2843         ldda    [%l7]ASI_BLK_P, %d32
2844         stda    %d48, [%i0]ASI_BLK_AIUS
2845         add     %l7, 64, %l7
2846         subcc   %i3, 64, %i3
2847         bz,pn   %ncc, 2f
2848         add     %i0, 64, %i0
2849         ba,a,pt %ncc, copyout_seg2
2850 
2851 0:
2852         FALIGN_D20
2853         stda    %d48, [%i0]ASI_BLK_AIUS
2854         add     %i0, 64, %i0
2855         membar  #Sync
2856         FALIGN_D36
2857         stda    %d48, [%i0]ASI_BLK_AIUS
2858         ba,pt   %ncc, copyout_blkd4
2859         add     %i0, 64, %i0
2860 
2861 1:
2862         FALIGN_D36
2863         stda    %d48, [%i0]ASI_BLK_AIUS
2864         add     %i0, 64, %i0
2865         membar  #Sync
2866         FALIGN_D4
2867         stda    %d48, [%i0]ASI_BLK_AIUS
2868         ba,pt   %ncc, copyout_blkd20
2869         add     %i0, 64, %i0
2870 
2871 2:
2872         FALIGN_D4
2873         stda    %d48, [%i0]ASI_BLK_AIUS
2874         add     %i0, 64, %i0
2875         membar  #Sync
2876         FALIGN_D20
2877         stda    %d48, [%i0]ASI_BLK_AIUS
2878         ba,pt   %ncc, copyout_blkd36
2879         add     %i0, 64, %i0
2880 
2881 copyout_seg3:
2882         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2883         FALIGN_D6
2884         ldda    [%l7]ASI_BLK_P, %d0
2885         stda    %d48, [%i0]ASI_BLK_AIUS
2886         add     %l7, 64, %l7
2887         subcc   %i3, 64, %i3
2888         bz,pn   %ncc, 0f
2889         add     %i0, 64, %i0
2890         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2891         FALIGN_D22
2892         ldda    [%l7]ASI_BLK_P, %d16
2893         stda    %d48, [%i0]ASI_BLK_AIUS
2894         add     %l7, 64, %l7
2895         subcc   %i3, 64, %i3
2896         bz,pn   %ncc, 1f
2897         add     %i0, 64, %i0
2898         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2899         FALIGN_D38
2900         ldda    [%l7]ASI_BLK_P, %d32
2901         stda    %d48, [%i0]ASI_BLK_AIUS
2902         add     %l7, 64, %l7
2903         subcc   %i3, 64, %i3
2904         bz,pn   %ncc, 2f
2905         add     %i0, 64, %i0
2906         ba,a,pt %ncc, copyout_seg3
2907 
2908 0:
2909         FALIGN_D22
2910         stda    %d48, [%i0]ASI_BLK_AIUS
2911         add     %i0, 64, %i0
2912         membar  #Sync
2913         FALIGN_D38
2914         stda    %d48, [%i0]ASI_BLK_AIUS
2915         ba,pt   %ncc, copyout_blkd6
2916         add     %i0, 64, %i0
2917 
2918 1:
2919         FALIGN_D38
2920         stda    %d48, [%i0]ASI_BLK_AIUS
2921         add     %i0, 64, %i0
2922         membar  #Sync
2923         FALIGN_D6
2924         stda    %d48, [%i0]ASI_BLK_AIUS
2925         ba,pt   %ncc, copyout_blkd22
2926         add     %i0, 64, %i0
2927 
2928 2:
2929         FALIGN_D6
2930         stda    %d48, [%i0]ASI_BLK_AIUS
2931         add     %i0, 64, %i0
2932         membar  #Sync
2933         FALIGN_D22
2934         stda    %d48, [%i0]ASI_BLK_AIUS
2935         ba,pt   %ncc, copyout_blkd38
2936         add     %i0, 64, %i0
2937 
2938 copyout_seg4:
2939         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2940         FALIGN_D8
2941         ldda    [%l7]ASI_BLK_P, %d0
2942         stda    %d48, [%i0]ASI_BLK_AIUS
2943         add     %l7, 64, %l7
2944         subcc   %i3, 64, %i3
2945         bz,pn   %ncc, 0f
2946         add     %i0, 64, %i0
2947         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2948         FALIGN_D24
2949         ldda    [%l7]ASI_BLK_P, %d16
2950         stda    %d48, [%i0]ASI_BLK_AIUS
2951         add     %l7, 64, %l7
2952         subcc   %i3, 64, %i3
2953         bz,pn   %ncc, 1f
2954         add     %i0, 64, %i0
2955         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2956         FALIGN_D40
2957         ldda    [%l7]ASI_BLK_P, %d32
2958         stda    %d48, [%i0]ASI_BLK_AIUS
2959         add     %l7, 64, %l7
2960         subcc   %i3, 64, %i3
2961         bz,pn   %ncc, 2f
2962         add     %i0, 64, %i0
2963         ba,a,pt %ncc, copyout_seg4
2964 
2965 0:
2966         FALIGN_D24
2967         stda    %d48, [%i0]ASI_BLK_AIUS
2968         add     %i0, 64, %i0
2969         membar  #Sync
2970         FALIGN_D40
2971         stda    %d48, [%i0]ASI_BLK_AIUS
2972         ba,pt   %ncc, copyout_blkd8
2973         add     %i0, 64, %i0
2974 
2975 1:
2976         FALIGN_D40
2977         stda    %d48, [%i0]ASI_BLK_AIUS
2978         add     %i0, 64, %i0
2979         membar  #Sync
2980         FALIGN_D8
2981         stda    %d48, [%i0]ASI_BLK_AIUS
2982         ba,pt   %ncc, copyout_blkd24
2983         add     %i0, 64, %i0
2984 
2985 2:
2986         FALIGN_D8
2987         stda    %d48, [%i0]ASI_BLK_AIUS
2988         add     %i0, 64, %i0
2989         membar  #Sync
2990         FALIGN_D24
2991         stda    %d48, [%i0]ASI_BLK_AIUS
2992         ba,pt   %ncc, copyout_blkd40
2993         add     %i0, 64, %i0
2994 
2995 copyout_seg5:
2996         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2997         FALIGN_D10
2998         ldda    [%l7]ASI_BLK_P, %d0
2999         stda    %d48, [%i0]ASI_BLK_AIUS
3000         add     %l7, 64, %l7
3001         subcc   %i3, 64, %i3
3002         bz,pn   %ncc, 0f
3003         add     %i0, 64, %i0
3004         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3005         FALIGN_D26
3006         ldda    [%l7]ASI_BLK_P, %d16
3007         stda    %d48, [%i0]ASI_BLK_AIUS
3008         add     %l7, 64, %l7
3009         subcc   %i3, 64, %i3
3010         bz,pn   %ncc, 1f
3011         add     %i0, 64, %i0
3012         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3013         FALIGN_D42
3014         ldda    [%l7]ASI_BLK_P, %d32
3015         stda    %d48, [%i0]ASI_BLK_AIUS
3016         add     %l7, 64, %l7
3017         subcc   %i3, 64, %i3
3018         bz,pn   %ncc, 2f
3019         add     %i0, 64, %i0
3020         ba,a,pt %ncc, copyout_seg5
3021 
3022 0:
3023         FALIGN_D26
3024         stda    %d48, [%i0]ASI_BLK_AIUS
3025         add     %i0, 64, %i0
3026         membar  #Sync
3027         FALIGN_D42
3028         stda    %d48, [%i0]ASI_BLK_AIUS
3029         ba,pt   %ncc, copyout_blkd10
3030         add     %i0, 64, %i0
3031 
3032 1:
3033         FALIGN_D42
3034         stda    %d48, [%i0]ASI_BLK_AIUS
3035         add     %i0, 64, %i0
3036         membar  #Sync
3037         FALIGN_D10
3038         stda    %d48, [%i0]ASI_BLK_AIUS
3039         ba,pt   %ncc, copyout_blkd26
3040         add     %i0, 64, %i0
3041 
3042 2:
3043         FALIGN_D10
3044         stda    %d48, [%i0]ASI_BLK_AIUS
3045         add     %i0, 64, %i0
3046         membar  #Sync
3047         FALIGN_D26
3048         stda    %d48, [%i0]ASI_BLK_AIUS
3049         ba,pt   %ncc, copyout_blkd42
3050         add     %i0, 64, %i0
3051 
3052 copyout_seg6:
3053         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3054         FALIGN_D12
3055         ldda    [%l7]ASI_BLK_P, %d0
3056         stda    %d48, [%i0]ASI_BLK_AIUS
3057         add     %l7, 64, %l7
3058         subcc   %i3, 64, %i3
3059         bz,pn   %ncc, 0f
3060         add     %i0, 64, %i0
3061         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3062         FALIGN_D28
3063         ldda    [%l7]ASI_BLK_P, %d16
3064         stda    %d48, [%i0]ASI_BLK_AIUS
3065         add     %l7, 64, %l7
3066         subcc   %i3, 64, %i3
3067         bz,pn   %ncc, 1f
3068         add     %i0, 64, %i0
3069         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3070         FALIGN_D44
3071         ldda    [%l7]ASI_BLK_P, %d32
3072         stda    %d48, [%i0]ASI_BLK_AIUS
3073         add     %l7, 64, %l7
3074         subcc   %i3, 64, %i3
3075         bz,pn   %ncc, 2f
3076         add     %i0, 64, %i0
3077         ba,a,pt %ncc, copyout_seg6
3078 
3079 0:
3080         FALIGN_D28
3081         stda    %d48, [%i0]ASI_BLK_AIUS
3082         add     %i0, 64, %i0
3083         membar  #Sync
3084         FALIGN_D44
3085         stda    %d48, [%i0]ASI_BLK_AIUS
3086         ba,pt   %ncc, copyout_blkd12
3087         add     %i0, 64, %i0
3088 
3089 1:
3090         FALIGN_D44
3091         stda    %d48, [%i0]ASI_BLK_AIUS
3092         add     %i0, 64, %i0
3093         membar  #Sync
3094         FALIGN_D12
3095         stda    %d48, [%i0]ASI_BLK_AIUS
3096         ba,pt   %ncc, copyout_blkd28
3097         add     %i0, 64, %i0
3098 
3099 2:
3100         FALIGN_D12
3101         stda    %d48, [%i0]ASI_BLK_AIUS
3102         add     %i0, 64, %i0
3103         membar  #Sync
3104         FALIGN_D28
3105         stda    %d48, [%i0]ASI_BLK_AIUS
3106         ba,pt   %ncc, copyout_blkd44
3107         add     %i0, 64, %i0
3108 
3109 copyout_seg7:
3110         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3111         FALIGN_D14
3112         ldda    [%l7]ASI_BLK_P, %d0
3113         stda    %d48, [%i0]ASI_BLK_AIUS
3114         add     %l7, 64, %l7
3115         subcc   %i3, 64, %i3
3116         bz,pn   %ncc, 0f
3117         add     %i0, 64, %i0
3118         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3119         FALIGN_D30
3120         ldda    [%l7]ASI_BLK_P, %d16
3121         stda    %d48, [%i0]ASI_BLK_AIUS
3122         add     %l7, 64, %l7
3123         subcc   %i3, 64, %i3
3124         bz,pn   %ncc, 1f
3125         add     %i0, 64, %i0
3126         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3127         FALIGN_D46
3128         ldda    [%l7]ASI_BLK_P, %d32
3129         stda    %d48, [%i0]ASI_BLK_AIUS
3130         add     %l7, 64, %l7
3131         subcc   %i3, 64, %i3
3132         bz,pn   %ncc, 2f
3133         add     %i0, 64, %i0
3134         ba,a,pt %ncc, copyout_seg7
3135 
3136 0:
3137         FALIGN_D30
3138         stda    %d48, [%i0]ASI_BLK_AIUS
3139         add     %i0, 64, %i0
3140         membar  #Sync
3141         FALIGN_D46
3142         stda    %d48, [%i0]ASI_BLK_AIUS
3143         ba,pt   %ncc, copyout_blkd14
3144         add     %i0, 64, %i0
3145 
3146 1:
3147         FALIGN_D46
3148         stda    %d48, [%i0]ASI_BLK_AIUS
3149         add     %i0, 64, %i0
3150         membar  #Sync
3151         FALIGN_D14
3152         stda    %d48, [%i0]ASI_BLK_AIUS
3153         ba,pt   %ncc, copyout_blkd30
3154         add     %i0, 64, %i0
3155 
3156 2:
3157         FALIGN_D14
3158         stda    %d48, [%i0]ASI_BLK_AIUS
3159         add     %i0, 64, %i0
3160         membar  #Sync
3161         FALIGN_D30
3162         stda    %d48, [%i0]ASI_BLK_AIUS
3163         ba,pt   %ncc, copyout_blkd46
3164         add     %i0, 64, %i0
3165 
3166 
3167         !
3168         ! dribble out the last partial block
3169         !
3170 copyout_blkd0:
3171         subcc   %i4, 8, %i4
3172         blu,pn  %ncc, copyout_blkdone
3173         faligndata %d0, %d2, %d48
3174         stda    %d48, [%i0]ASI_USER
3175         add     %i0, 8, %i0
3176 copyout_blkd2:
3177         subcc   %i4, 8, %i4
3178         blu,pn  %ncc, copyout_blkdone
3179         faligndata %d2, %d4, %d48
3180         stda    %d48, [%i0]ASI_USER
3181         add     %i0, 8, %i0
3182 copyout_blkd4:
3183         subcc   %i4, 8, %i4
3184         blu,pn  %ncc, copyout_blkdone
3185         faligndata %d4, %d6, %d48
3186         stda    %d48, [%i0]ASI_USER
3187         add     %i0, 8, %i0
3188 copyout_blkd6:
3189         subcc   %i4, 8, %i4
3190         blu,pn  %ncc, copyout_blkdone
3191         faligndata %d6, %d8, %d48
3192         stda    %d48, [%i0]ASI_USER
3193         add     %i0, 8, %i0
3194 copyout_blkd8:
3195         subcc   %i4, 8, %i4
3196         blu,pn  %ncc, copyout_blkdone
3197         faligndata %d8, %d10, %d48
3198         stda    %d48, [%i0]ASI_USER
3199         add     %i0, 8, %i0
3200 copyout_blkd10:
3201         subcc   %i4, 8, %i4
3202         blu,pn  %ncc, copyout_blkdone
3203         faligndata %d10, %d12, %d48
3204         stda    %d48, [%i0]ASI_USER
3205         add     %i0, 8, %i0
3206 copyout_blkd12:
3207         subcc   %i4, 8, %i4
3208         blu,pn  %ncc, copyout_blkdone
3209         faligndata %d12, %d14, %d48
3210         stda    %d48, [%i0]ASI_USER
3211         add     %i0, 8, %i0
3212 copyout_blkd14:
3213         subcc   %i4, 8, %i4
3214         blu,pn  %ncc, copyout_blkdone
3215         fsrc1   %d14, %d0
3216         ba,a,pt %ncc, copyout_blkleft
3217 
3218 copyout_blkd16:
3219         subcc   %i4, 8, %i4
3220         blu,pn  %ncc, copyout_blkdone
3221         faligndata %d16, %d18, %d48
3222         stda    %d48, [%i0]ASI_USER
3223         add     %i0, 8, %i0
3224 copyout_blkd18:
3225         subcc   %i4, 8, %i4
3226         blu,pn  %ncc, copyout_blkdone
3227         faligndata %d18, %d20, %d48
3228         stda    %d48, [%i0]ASI_USER
3229         add     %i0, 8, %i0
3230 copyout_blkd20:
3231         subcc   %i4, 8, %i4
3232         blu,pn  %ncc, copyout_blkdone
3233         faligndata %d20, %d22, %d48
3234         stda    %d48, [%i0]ASI_USER
3235         add     %i0, 8, %i0
3236 copyout_blkd22:
3237         subcc   %i4, 8, %i4
3238         blu,pn  %ncc, copyout_blkdone
3239         faligndata %d22, %d24, %d48
3240         stda    %d48, [%i0]ASI_USER
3241         add     %i0, 8, %i0
3242 copyout_blkd24:
3243         subcc   %i4, 8, %i4
3244         blu,pn  %ncc, copyout_blkdone
3245         faligndata %d24, %d26, %d48
3246         stda    %d48, [%i0]ASI_USER
3247         add     %i0, 8, %i0
3248 copyout_blkd26:
3249         subcc   %i4, 8, %i4
3250         blu,pn  %ncc, copyout_blkdone
3251         faligndata %d26, %d28, %d48
3252         stda    %d48, [%i0]ASI_USER
3253         add     %i0, 8, %i0
3254 copyout_blkd28:
3255         subcc   %i4, 8, %i4
3256         blu,pn  %ncc, copyout_blkdone
3257         faligndata %d28, %d30, %d48
3258         stda    %d48, [%i0]ASI_USER
3259         add     %i0, 8, %i0
3260 copyout_blkd30:
3261         subcc   %i4, 8, %i4
3262         blu,pn  %ncc, copyout_blkdone
3263         fsrc1   %d30, %d0
3264         ba,a,pt %ncc, copyout_blkleft
3265 copyout_blkd32:
3266         subcc   %i4, 8, %i4
3267         blu,pn  %ncc, copyout_blkdone
3268         faligndata %d32, %d34, %d48
3269         stda    %d48, [%i0]ASI_USER
3270         add     %i0, 8, %i0
3271 copyout_blkd34:
3272         subcc   %i4, 8, %i4
3273         blu,pn  %ncc, copyout_blkdone
3274         faligndata %d34, %d36, %d48
3275         stda    %d48, [%i0]ASI_USER
3276         add     %i0, 8, %i0
3277 copyout_blkd36:
3278         subcc   %i4, 8, %i4
3279         blu,pn  %ncc, copyout_blkdone
3280         faligndata %d36, %d38, %d48
3281         stda    %d48, [%i0]ASI_USER
3282         add     %i0, 8, %i0
3283 copyout_blkd38:
3284         subcc   %i4, 8, %i4
3285         blu,pn  %ncc, copyout_blkdone
3286         faligndata %d38, %d40, %d48
3287         stda    %d48, [%i0]ASI_USER
3288         add     %i0, 8, %i0
3289 copyout_blkd40:
3290         subcc   %i4, 8, %i4
3291         blu,pn  %ncc, copyout_blkdone
3292         faligndata %d40, %d42, %d48
3293         stda    %d48, [%i0]ASI_USER
3294         add     %i0, 8, %i0
3295 copyout_blkd42:
3296         subcc   %i4, 8, %i4
3297         blu,pn  %ncc, copyout_blkdone
3298         faligndata %d42, %d44, %d48
3299         stda    %d48, [%i0]ASI_USER
3300         add     %i0, 8, %i0
3301 copyout_blkd44:
3302         subcc   %i4, 8, %i4
3303         blu,pn  %ncc, copyout_blkdone
3304         faligndata %d44, %d46, %d48
3305         stda    %d48, [%i0]ASI_USER
3306         add     %i0, 8, %i0
3307 copyout_blkd46:
3308         subcc   %i4, 8, %i4
3309         blu,pn  %ncc, copyout_blkdone
3310         fsrc1   %d46, %d0
3311 
3312 copyout_blkleft:
3313 1:
3314         ldd     [%l7], %d2
3315         add     %l7, 8, %l7
3316         subcc   %i4, 8, %i4
3317         faligndata %d0, %d2, %d8
3318         stda    %d8, [%i0]ASI_USER
3319         blu,pn  %ncc, copyout_blkdone
3320         add     %i0, 8, %i0
3321         ldd     [%l7], %d0
3322         add     %l7, 8, %l7
3323         subcc   %i4, 8, %i4
3324         faligndata %d2, %d0, %d8
3325         stda    %d8, [%i0]ASI_USER
3326         bgeu,pt %ncc, 1b
3327         add     %i0, 8, %i0
3328 
3329 copyout_blkdone:
3330         tst     %i2
3331         bz,pt   %ncc, .copyout_exit
3332         and     %l3, 0x4, %l3           ! fprs.du = fprs.dl = 0
3333 
3334 7:      ldub    [%i1], %i4
3335         inc     %i1
3336         stba    %i4, [%i0]ASI_USER
3337         inc     %i0
3338         deccc   %i2
3339         bgu     %ncc, 7b
3340           nop
3341 
3342 .copyout_exit:
3343         membar  #StoreLoad|#StoreStore
3344         btst    FPUSED_FLAG, SAVED_LOFAULT
3345         bz      1f
3346           nop
3347 
3348         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
3349         wr      %o2, 0, %gsr            ! restore gsr
3350 
3351         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3352         btst    FPRS_FEF, %o3
3353         bz      4f
3354           nop
3355 
3356         ! restore fpregs from stack
3357         membar  #Sync
3358         add     %fp, STACK_BIAS - 257, %o2
3359         and     %o2, -64, %o2
3360         ldda    [%o2]ASI_BLK_P, %d0
3361         add     %o2, 64, %o2
3362         ldda    [%o2]ASI_BLK_P, %d16
3363         add     %o2, 64, %o2
3364         ldda    [%o2]ASI_BLK_P, %d32
3365         add     %o2, 64, %o2
3366         ldda    [%o2]ASI_BLK_P, %d48
3367         membar  #Sync
3368 
3369         ba,pt   %ncc, 1f
3370           wr    %o3, 0, %fprs           ! restore fprs
3371 
3372 4:
3373         FZERO                           ! zero all of the fpregs
3374         wr      %o3, 0, %fprs           ! restore fprs
3375 
3376 1:
3377         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3378         membar  #Sync                   ! sync error barrier
3379         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3380         ret
3381         restore %g0, 0, %o0
3382 
3383 .copyout_err:
3384         ldn     [THREAD_REG + T_COPYOPS], %o4
3385         brz     %o4, 2f
3386         nop
3387         ldn     [%o4 + CP_COPYOUT], %g2
3388         jmp     %g2
3389         nop
3390 2:
3391         retl
3392         mov     -1, %o0
3393         SET_SIZE(copyout)
3394 
3395 #endif  /* lint */
3396 
3397 
3398 #ifdef  lint
3399 
/*
 * Lint-only C stub for xcopyout: gives lint the routine's signature and
 * always returns 0.  It is compiled only when `lint' is defined; otherwise
 * the assembly ENTRY(xcopyout) in the #else branch is used.
 */
3400 /*ARGSUSED*/
3401 int
3402 xcopyout(const void *kaddr, void *uaddr, size_t count)
3403 { return (0); }
3404 
3405 #else   /* lint */
3406 
/*
 * xcopyout(kaddr, uaddr, count)
 *
 * Thin entry point that shares the body at .do_copyout.  The only work
 * done here is to place the address of .xcopyout_err in REAL_LOFAULT
 * (sethi/or pair, with the `or' sitting in the branch delay slot) before
 * branching; the incoming arguments are left untouched.
 * NOTE(review): .do_copyout is defined outside this block; presumably it
 * transfers to the REAL_LOFAULT handler when the copy faults -- confirm.
 *
 * .xcopyout_err:
 *      Loads the thread's T_COPYOPS pointer.  If it is non-NULL, control
 *      is handed to the routine stored at offset CP_XCOPYOUT with a plain
 *      jmp (no return address is written here).  If it is NULL, the
 *      routine returns with %o0 = %g1.  Compare .copyout_err, which
 *      returns -1 in the same situation.
 *      NOTE(review): %g1 is not set in this block; presumably it holds
 *      the error code left by the fault path -- confirm.
 */
3407         ENTRY(xcopyout)
3408         sethi   %hi(.xcopyout_err), REAL_LOFAULT
3409         b       .do_copyout
3410           or    REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
3411 .xcopyout_err:
3412         ldn     [THREAD_REG + T_COPYOPS], %o4
3413         brz     %o4, 2f
3414         nop
3415         ldn     [%o4 + CP_XCOPYOUT], %g2
3416         jmp     %g2
3417         nop
3418 2:
3419         retl
3420         mov     %g1, %o0
3421         SET_SIZE(xcopyout)
3422 
3423 #endif  /* lint */
3424         
3425 #ifdef  lint
3426 
/*
 * Lint-only C stub for xcopyout_little: gives lint the routine's signature
 * and always returns 0.  It is compiled only when `lint' is defined;
 * otherwise the assembly ENTRY(xcopyout_little) in the #else branch is used.
 */
3427 /*ARGSUSED*/
3428 int
3429 xcopyout_little(const void *kaddr, void *uaddr, size_t count)
3430 { return (0); }
3431 
3432 #else   /* lint */
3433 
/*
 * xcopyout_little(kaddr, uaddr, count)
 *
 * Byte-at-a-time copy that reverses byte order: the first byte stored at
 * uaddr is the last byte of the source, so uaddr[i] receives
 * kaddr[count - 1 - i].  Stores go through ASI_AIUSL; loads are ordinary
 * ldub.  Returns 0 in %o0 on the paths shown here.
 *
 * Register use:
 *      %o0     on entry kaddr; rebased to kaddr + 2*count - 1 and then
 *              decremented by 2 per byte, so %o0 + %o3 steps backward one
 *              byte per iteration, starting at kaddr + count - 1
 *      %o1     on entry uaddr; rebased to uaddr + count, so %o1 + %o3
 *              steps forward starting at uaddr
 *      %o2     count
 *      %o3     running index, from -count up to 0
 *      %o4     handler address first, then count - 1, then the data byte
 *      %o5     caller's t_lofault, restored on exit
 *
 * Flow:
 *      - t_lofault is switched to .little_err (a membar Sync precedes the
 *        store).  NOTE(review): .little_err is defined outside this block.
 *      - subcc computes -count and sets the condition codes; bz skips the
 *        loop for a zero count.  The `sub' in its delay slot is not
 *        annulled, so it also runs on the zero-count path, where its
 *        result is unused.
 *      - In the loop, inccc produces a carry exactly when %o3 wraps from
 *        -1 to 0, which ends the loop.  The `sub' between inccc and bcc
 *        does not touch the condition codes.  bcc,a annuls the delay-slot
 *        load on the final (not taken) pass, so the byte before kaddr is
 *        never read.
 *      - Exit restores the saved t_lofault after a membar Sync.
 */
3434         ENTRY(xcopyout_little)
3435         sethi   %hi(.little_err), %o4
3436         ldn     [THREAD_REG + T_LOFAULT], %o5
3437         or      %o4, %lo(.little_err), %o4
3438         membar  #Sync                   ! sync error barrier
3439         stn     %o4, [THREAD_REG + T_LOFAULT]
3440 
3441         subcc   %g0, %o2, %o3
3442         add     %o0, %o2, %o0
3443         bz,pn   %ncc, 2f                ! check for zero bytes
3444         sub     %o2, 1, %o4
3445         add     %o0, %o4, %o0           ! start w/last byte
3446         add     %o1, %o2, %o1
3447         ldub    [%o0+%o3], %o4
3448 
3449 1:      stba    %o4, [%o1+%o3]ASI_AIUSL
3450         inccc   %o3
3451         sub     %o0, 2, %o0             ! get next byte
3452         bcc,a,pt %ncc, 1b
3453           ldub  [%o0+%o3], %o4
3454 
3455 2:      membar  #Sync                   ! sync error barrier
3456         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3457         retl
3458         mov     %g0, %o0                ! return (0)
3459         SET_SIZE(xcopyout_little)
3460 
3461 #endif  /* lint */
3462 
3463 /*
3464  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
3465  */
3466 
3467 #if defined(lint)
3468 
/*
 * Lint-only C stub for copyin: supplies the signature for lint and always
 * returns 0.  Compiled only when `lint' is defined; the #else branch holds
 * the assembly ENTRY(copyin).
 */
3469 /*ARGSUSED*/
3470 int
3471 copyin(const void *uaddr, void *kaddr, size_t count)
3472 { return (0); }
3473 
3474 #else   /* lint */
3475 
3476         ENTRY(copyin)
3477         sethi   %hi(.copyin_err), REAL_LOFAULT
3478         or      REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
3479 
3480 .do_copyin:
3481         !
3482         ! Check the length and bail if zero.
3483         !
3484         tst     %o2
3485         bnz,pt  %ncc, 1f
3486           nop
3487         retl
3488           clr   %o0
3489 1:
3490         sethi   %hi(copyio_fault), %o4
3491         or      %o4, %lo(copyio_fault), %o4
3492         sethi   %hi(copyio_fault_nowindow), %o3
3493         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3494         or      %o3, %lo(copyio_fault_nowindow), %o3
3495         membar  #Sync
3496         stn     %o3, [THREAD_REG + T_LOFAULT]
3497 
3498         mov     %o0, SAVE_SRC
3499         mov     %o1, SAVE_DST
3500         mov     %o2, SAVE_COUNT
3501 
3502         !
3503         ! Check to see if we're more than SMALL_LIMIT.
3504         !
3505         subcc   %o2, SMALL_LIMIT, %o3
3506         bgu,a,pt %ncc, .dci_ns
3507         or      %o0, %o1, %o3
3508         !
3509         ! What was previously ".small_copyin"
3510         !
3511 .dcibcp:
3512         sub     %g0, %o2, %o3           ! setup for copy loop
3513         add     %o0, %o2, %o0
3514         add     %o1, %o2, %o1
3515         ba,pt   %ncc, .dcicl
3516         lduba   [%o0 + %o3]ASI_USER, %o4
3517         !
3518         ! %o0 and %o1 point at the end and remain pointing at the end
3519         ! of their buffers. We pull things out by adding %o3 (which is
3520         ! the negation of the length) to the buffer end which gives us
3521         ! the current location in the buffers. By incrementing %o3 we walk
3522         ! through both buffers without having to bump each buffer's
3523         ! pointer. A very fast 4 instruction loop.
3524         !
3525         .align 16
3526 .dcicl:
3527         stb     %o4, [%o1 + %o3]
3528         inccc   %o3
3529         bl,a,pt %ncc, .dcicl
3530         lduba   [%o0 + %o3]ASI_USER, %o4
3531         !
3532         ! We're done. Go home.
3533         !       
3534         membar  #Sync
3535         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3536         retl
3537         clr     %o0
3538         !
3539         ! Try aligned copies from here.
3540         !
3541 .dci_ns:
3542         !
3543         ! See if we're single byte aligned. If we are, check the
3544         ! limit for single byte copies. If we're smaller, or equal,
3545         ! bounce to the byte for byte copy loop. Otherwise do it in
3546         ! HW (if enabled).
3547         !
3548         btst    1, %o3
3549         bz,a,pt %icc, .dcih8
3550         btst    7, %o3
3551         !
3552         ! We're single byte aligned.
3553         !
3554         subcc   %o2, VIS_COPY_THRESHOLD, %o3
3555         bleu,pt %ncc, .dcibcp
3556         sethi   %hi(hw_copy_limit_1), %o3
3557         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
3558         !
3559         ! Is HW copy on? If not do everything byte for byte.
3560         !
3561         tst     %o3
3562         bz,pn   %icc, .dcibcp
3563         subcc   %o3, %o2, %o3
3564         !
3565         ! Are we bigger than the HW limit? If not
3566         ! go to byte for byte.
3567         !
3568         bge,pt  %ncc, .dcibcp
3569         nop
3570         !
3571         ! We're big enough and copy is on. Do it with HW.
3572         !
3573         ba,pt   %ncc, .big_copyin
3574         nop
3575 .dcih8:
3576         !
3577         ! 8 byte aligned?
3578         !
3579         bnz,a   %ncc, .dcih4
3580         btst    3, %o3
3581         !
3582         ! We're eight byte aligned.
3583         !
3584         subcc   %o2, VIS_COPY_THRESHOLD, %o3
3585         bleu,pt %ncc, .dcis8
3586         sethi   %hi(hw_copy_limit_8), %o3
3587         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
3588         !
3589         ! Is HW assist on? If not, do it with the aligned copy.
3590         !
3591         tst     %o3
3592         bz,pn   %icc, .dcis8
3593         subcc   %o3, %o2, %o3
3594         bge     %ncc, .dcis8
3595         nop
3596         ba,pt   %ncc, .big_copyin
3597         nop
3598 .dcis8:
3599         !
3600         ! Housekeeping for copy loops. Uses same idea as in the byte for
3601         ! byte copy loop above.
3602         !
3603         add     %o0, %o2, %o0
3604         add     %o1, %o2, %o1
3605         sub     %g0, %o2, %o3
3606         ba,pt   %ncc, .didebc
3607         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
3608         !
3609         ! 4 byte aligned?
3610         !
3611 .dcih4:
3612         bnz     %ncc, .dcih2            ! flags from "btst 3, %o3" at .dcih8
3613         subcc   %o2, VIS_COPY_THRESHOLD, %o3    ! delay slot; .dcih2 reuses flags
3614         bleu,pt %ncc, .dcis4            ! length <= threshold: plain 4 byte loop
3615         sethi   %hi(hw_copy_limit_4), %o3       ! delay slot: start forming address
3616         ld      [%o3 + %lo(hw_copy_limit_4)], %o3       ! %o3 = hw_copy_limit_4
3617         !
3618         ! Is HW assist on? If not, do it with the aligned copy.
3619         !
3620         tst     %o3
3621         bz,pn   %icc, .dcis4            ! limit is zero: HW assist is off
3622         subcc   %o3, %o2, %o3           ! delay slot: %o3 = limit - length
3623         !
3624         ! We're negative if our size is larger than hw_copy_limit_4.
3625         !
3626         bge     %ncc, .dcis4            ! length <= limit: plain 4 byte loop
3627         nop
3628         ba,pt   %ncc, .big_copyin       ! length > limit: use the HW copy
3629         nop
3630 .dcis4:
3631         !
3632         ! Housekeeping for copy loops. Uses same idea as in the byte
3633         ! for byte copy loop above.
3634         !
3635         add     %o0, %o2, %o0
3636         add     %o1, %o2, %o1
3637         sub     %g0, %o2, %o3
3638         ba,pt   %ncc, .didfbc
3639         srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
3640 .dcih2:
3641         !
3642         ! We're two byte aligned. Check for "smallness"
3643         ! done in delay at .dcih4
3644         !
3645         bleu,pt %ncc, .dcis2
3646         sethi   %hi(hw_copy_limit_2), %o3
3647         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
3648         !
3649         ! Is HW assist on? If not, do it with the aligned copy.
3650         !
3651         tst     %o3
3652         bz,pn   %icc, .dcis2
3653         subcc   %o3, %o2, %o3
3654         !
3655         ! Are we larger than the HW limit?
3656         !
3657         bge     %ncc, .dcis2
3658         nop
3659         !
3660         ! HW assist is on and we're large enough to use it.
3661         !
3662         ba,pt   %ncc, .big_copyin
3663         nop
3664         !
3665         ! Housekeeping for copy loops. Uses same idea as in the byte
3666         ! for byte copy loop above.
3667         !
3668 .dcis2:
3669         add     %o0, %o2, %o0
3670         add     %o1, %o2, %o1
3671         sub     %g0, %o2, %o3
3672         ba,pt   %ncc, .didtbc
3673         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
3674         !
3675 .small_copyin:
3676         !
3677         ! Why are we doing this AGAIN? There are certain conditions in
3678         ! big copyin that will cause us to forgo the HW assisted copies
3679         ! and bounce back to a non-hw assisted copy. This dispatches
3680         ! those copies. Note that we branch around this in the main line
3681         ! code.
3682         !
3683         ! We make no check for limits or HW enablement here. We've
3684         ! already been told that we're a poster child so just go off
3685         ! and do it.
3686         !
3687         or      %o0, %o1, %o3           ! combine low bits of both addresses
3688         btst    1, %o3                  ! is either address odd?
3689         bnz     %icc, .dcibcp           ! Most likely
3690         btst    7, %o3                  ! delay slot: flags for the next branch
3691         bz      %icc, .dcis8            ! both addresses 8 byte aligned
3692         btst    3, %o3                  ! delay slot: flags for the next branch
3693         bz      %icc, .dcis4            ! both addresses 4 byte aligned
3694         nop
3695         ba,pt   %ncc, .dcis2            ! otherwise 2 byte aligned
3696         nop
3697         !
3698         ! Eight byte aligned copies. A steal from the original .small_copyin
3699         ! with modifications. %o2 is number of 8 byte chunks to copy. When
3700         ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
3701         ! to copy.
3702         !
3703         .align 32
3704 .didebc:
3705         ldxa    [%o0 + %o3]ASI_USER, %o4
3706         deccc   %o2
3707         stx     %o4, [%o1 + %o3]
3708         bg,pt   %ncc, .didebc
3709         addcc   %o3, 8, %o3
3710         !
3711         ! End of copy loop. Most 8 byte aligned copies end here.
3712         !
3713         bz,pt   %ncc, .dcifh
3714         nop
3715         !
3716         ! Something is left. Do it byte for byte.
3717         !
3718         ba,pt   %ncc, .dcicl
3719         lduba   [%o0 + %o3]ASI_USER, %o4
3720         !
3721         ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
3722         !
3723         .align 32
3724 .didfbc:
3725         lduwa   [%o0 + %o3]ASI_USER, %o4
3726         deccc   %o2
3727         st      %o4, [%o1 + %o3]
3728         bg,pt   %ncc, .didfbc
3729         addcc   %o3, 4, %o3
3730         !
3731         ! End of copy loop. Most 4 byte aligned copies end here.
3732         !
3733         bz,pt   %ncc, .dcifh
3734         nop
3735         !
3736         ! Something is left. Do it byte for byte.
3737         !
3738         ba,pt   %ncc, .dcicl
3739         lduba   [%o0 + %o3]ASI_USER, %o4
3740         !
3741         ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
3742         ! copy.
3743         !
3744         .align 32
3745 .didtbc:
3746         lduha   [%o0 + %o3]ASI_USER, %o4
3747         deccc   %o2
3748         sth     %o4, [%o1 + %o3]
3749         bg,pt   %ncc, .didtbc
3750         addcc   %o3, 2, %o3
3751         !
3752         ! End of copy loop. Most 2 byte aligned copies end here.
3753         !
3754         bz,pt   %ncc, .dcifh
3755         nop
3756         !
3757         ! Deal with the last byte
3758         !
3759         lduba   [%o0 + %o3]ASI_USER, %o4
3760         stb     %o4, [%o1 + %o3]
3761 .dcifh:
3762         membar  #Sync
3763         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3764         retl
3765         clr     %o0
3766 
3767 .big_copyin:
3768         !
3769         ! Are we using the FP registers?
3770         !
3771         rd      %fprs, %o3              ! check for unused fp
3772         btst    FPRS_FEF, %o3
3773         bnz     %ncc, .copyin_fpregs_inuse      ! FEF set: FP regs are in use
3774         nop
3775         !
3776         ! We're going off to do a block copy.
3777         ! Switch fault handlers and grab a window. We
3778         ! don't do a membar #Sync since we've touched only
3779         ! kernel data to this point.
3780         !
3781         stn     %o4, [THREAD_REG + T_LOFAULT]   ! handler address supplied in %o4
3782         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3783         !
3784         ! %o3 is %i3 after the save...
3785         !
3786         st      %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]     ! keep old %fprs
3787         ba,pt   %ncc, .do_blockcopyin
3788         wr      %g0, FPRS_FEF, %fprs    ! delay slot: %fprs = FPRS_FEF
3789 .copyin_fpregs_inuse:
3790         !
3791         ! We're here if the FP regs are in use, so a block copy also has to
3792         ! save them. See if the request exceeds our suddenly larger minimum.
3793         !
3794         cmp     %o2, VIS_COPY_THRESHOLD+(64*4)  ! no save yet: length is in %o2
3795         bl      %ncc, .small_copyin     ! below the larger minimum: no HW copy
3796         nop
3797         !
3798         ! We're going off and do a block copy.
3799         ! Change to the heavy duty fault handler and grab a window first.
3800         ! New handler is passed in %o4.
3801         !
3802         stn     %o4, [THREAD_REG + T_LOFAULT]
3803         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3804         !
3805         ! %o3 (the %fprs value read at .big_copyin) is now %i3
3806         !
3807         st      %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
3808 
3809         ! save in-use fpregs on stack, in a 64 byte aligned area of our frame
3810         wr      %g0, FPRS_FEF, %fprs
3811         membar  #Sync
3812         add     %fp, STACK_BIAS - 257, %o2      ! %o2 is scratch after the save
3813         and     %o2, -64, %o2           ! round down to a 64 byte boundary
3814         stda    %d0, [%o2]ASI_BLK_P     ! four block stores, 64 bytes apart
3815         add     %o2, 64, %o2
3816         stda    %d16, [%o2]ASI_BLK_P
3817         add     %o2, 64, %o2
3818         stda    %d32, [%o2]ASI_BLK_P
3819         add     %o2, 64, %o2
3820         stda    %d48, [%o2]ASI_BLK_P
3821         membar  #Sync
3822 
3823 .do_blockcopyin:
3824         membar  #StoreStore|#StoreLoad|#LoadStore
3825 
3826         rd      %gsr, %o2
3827         st      %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
3828 
3829         ! Set the lower bit in the saved t_lofault to indicate
3830         ! that we need to clear the %fprs register on the way
3831         ! out
3832         or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3833 
3834         ! Swap src/dst since the code below is memcpy code
3835         ! and memcpy/bcopy have different calling sequences
3836         mov     %i1, %i5
3837         mov     %i0, %i1
3838         mov     %i5, %i0
3839 
3840 !!! This code is nearly identical to the version in the sun4u
3841 !!! libc_psr.  Most bugfixes made to that file should be
3842 !!! merged into this routine.
3843 
3844         andcc   %i0, 7, %o3
3845         bz      copyin_blkcpy
3846         sub     %o3, 8, %o3
3847         neg     %o3
3848         sub     %i2, %o3, %i2
3849 
3850         ! Align Destination on double-word boundary
3851 
3852 2:      lduba   [%i1]ASI_USER, %o4
3853         inc     %i1
3854         inc     %i0
3855         deccc   %o3
3856         bgu     %ncc, 2b
3857         stb     %o4, [%i0-1]
3858 copyin_blkcpy:
3859         andcc   %i0, 63, %i3
3860         bz,pn   %ncc, copyin_blalign    ! now block aligned
3861         sub     %i3, 64, %i3
3862         neg     %i3                     ! bytes till block aligned
3863         sub     %i2, %i3, %i2           ! update %i2 with new count
3864 
3865         ! Copy %i3 bytes till dst is block (64 byte) aligned. use
3866         ! double word copies.
3867 
3868         alignaddr %i1, %g0, %g1
3869         ldda    [%g1]ASI_USER, %d0
3870         add     %g1, 8, %g1
3871 6:
3872         ldda    [%g1]ASI_USER, %d2
3873         add     %g1, 8, %g1
3874         subcc   %i3, 8, %i3
3875         faligndata %d0, %d2, %d8
3876         std     %d8, [%i0]
3877         add     %i1, 8, %i1
3878         bz,pn   %ncc, copyin_blalign
3879         add     %i0, 8, %i0
3880         ldda    [%g1]ASI_USER, %d0
3881         add     %g1, 8, %g1
3882         subcc   %i3, 8, %i3
3883         faligndata %d2, %d0, %d8
3884         std     %d8, [%i0]
3885         add     %i1, 8, %i1
3886         bgu,pn  %ncc, 6b
3887         add     %i0, 8, %i0
3888  
3889 copyin_blalign:
3890         membar  #StoreLoad
3891         ! On entry %i2 = bytes still to copy. It is split three ways:
3892         ! %i3 = bytes moved as whole 64 byte blocks
3893         ! %i4 = bytes dribbled out as doublewords; %i2 = bytes for the byte loop
3894         sub     %i2, 64, %i3
3895         andn    %i3, 63, %i3            ! %i3 = (len - 64) rounded down to 64
3896         sub     %i2, %i3, %i4
3897         andn    %i4, 7, %i4             ! remainder rounded down to 8 ...
3898         sub     %i4, 16, %i4            ! ... less 16 bytes, which stay in %i2
3899         sub     %i2, %i4, %i2
3900         sub     %i2, %i3, %i2           ! %i2 = len - %i4 - %i3
3901 
3902         andn    %i1, 0x3f, %l7          ! blk aligned address
3903         alignaddr %i1, %g0, %g0         ! gen %gsr
3904 
3905         srl     %i1, 3, %l5             ! bits 3,4,5 are now least sig in  %l5
3906         andcc   %l5, 7, %i5             ! %i5 = source address bits 3-5 (0..7)
3907         add     %i1, %i4, %i1           ! advance source past the doubleword part
3908         add     %i1, %i3, %i1           ! ... and past the block part
3909 
3910         ldda    [%l7]ASI_BLK_AIUS, %d0  ! preload three source blocks
3911         add     %l7, 64, %l7
3912         ldda    [%l7]ASI_BLK_AIUS, %d16
3913         add     %l7, 64, %l7
3914         ldda    [%l7]ASI_BLK_AIUS, %d32
3915         add     %l7, 64, %l7
3916         sub     %i3, 128, %i3           ! two blocks are stored after the loops exit
3917 
3918         ! switch statement to get us to the right 8 byte blk within a
3919         ! 64 byte block
3920 
3921         cmp      %i5, 4
3922         bgeu,a   copyin_hlf             ! %i5 is 4..7
3923         cmp      %i5, 6                 ! annulled slot: runs only if branch taken
3924         cmp      %i5, 2
3925         bgeu,a   copyin_sqtr            ! %i5 is 2 or 3
3926         nop
3927         cmp      %i5, 1
3928         be,a     copyin_seg1
3929         nop
3930         ba,pt    %ncc, copyin_seg0
3931         nop
3932 copyin_sqtr:
3933         be,a     copyin_seg2            ! flags still from "cmp %i5, 2"
3934         nop
3935         ba,pt    %ncc, copyin_seg3
3936         nop
3937 
3938 copyin_hlf:
3939         bgeu,a   copyin_fqtr            ! flags from "cmp %i5, 6": %i5 is 6 or 7
3940         nop      
3941         cmp      %i5, 5
3942         be,a     copyin_seg5
3943         nop
3944         ba,pt    %ncc, copyin_seg4
3945         nop
3946 copyin_fqtr:
3947         be,a     copyin_seg6            ! flags still from "cmp %i5, 6"
3948         nop
3949         ba,pt    %ncc, copyin_seg7
3950         nop
3951         
3952 copyin_seg0:
3953         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3954         FALIGN_D0
3955         ldda    [%l7]ASI_BLK_AIUS, %d0
3956         stda    %d48, [%i0]ASI_BLK_P
3957         add     %l7, 64, %l7
3958         subcc   %i3, 64, %i3
3959         bz,pn   %ncc, 0f
3960         add     %i0, 64, %i0
3961         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3962         FALIGN_D16
3963         ldda    [%l7]ASI_BLK_AIUS, %d16
3964         stda    %d48, [%i0]ASI_BLK_P
3965         add     %l7, 64, %l7
3966         subcc   %i3, 64, %i3
3967         bz,pn   %ncc, 1f
3968         add     %i0, 64, %i0
3969         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3970         FALIGN_D32
3971         ldda    [%l7]ASI_BLK_AIUS, %d32
3972         stda    %d48, [%i0]ASI_BLK_P
3973         add     %l7, 64, %l7
3974         subcc   %i3, 64, %i3
3975         bz,pn   %ncc, 2f
3976         add     %i0, 64, %i0
3977         ba,a,pt %ncc, copyin_seg0
3978 
3979 0:
3980         FALIGN_D16
3981         stda    %d48, [%i0]ASI_BLK_P
3982         add     %i0, 64, %i0
3983         membar  #Sync
3984         FALIGN_D32
3985         stda    %d48, [%i0]ASI_BLK_P
3986         ba,pt   %ncc, copyin_blkd0
3987         add     %i0, 64, %i0
3988 
3989 1:
3990         FALIGN_D32
3991         stda    %d48, [%i0]ASI_BLK_P
3992         add     %i0, 64, %i0
3993         membar  #Sync
3994         FALIGN_D0
3995         stda    %d48, [%i0]ASI_BLK_P
3996         ba,pt   %ncc, copyin_blkd16
3997         add     %i0, 64, %i0
3998 
3999 2:
4000         FALIGN_D0
4001         stda    %d48, [%i0]ASI_BLK_P
4002         add     %i0, 64, %i0
4003         membar  #Sync
4004         FALIGN_D16
4005         stda    %d48, [%i0]ASI_BLK_P
4006         ba,pt   %ncc, copyin_blkd32
4007         add     %i0, 64, %i0
4008 
4009 copyin_seg1:
4010         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4011         FALIGN_D2
4012         ldda    [%l7]ASI_BLK_AIUS, %d0
4013         stda    %d48, [%i0]ASI_BLK_P
4014         add     %l7, 64, %l7
4015         subcc   %i3, 64, %i3
4016         bz,pn   %ncc, 0f
4017         add     %i0, 64, %i0
4018         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4019         FALIGN_D18
4020         ldda    [%l7]ASI_BLK_AIUS, %d16
4021         stda    %d48, [%i0]ASI_BLK_P
4022         add     %l7, 64, %l7
4023         subcc   %i3, 64, %i3
4024         bz,pn   %ncc, 1f
4025         add     %i0, 64, %i0
4026         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4027         FALIGN_D34
4028         ldda    [%l7]ASI_BLK_AIUS, %d32
4029         stda    %d48, [%i0]ASI_BLK_P
4030         add     %l7, 64, %l7
4031         subcc   %i3, 64, %i3
4032         bz,pn   %ncc, 2f
4033         add     %i0, 64, %i0
4034         ba,a,pt %ncc, copyin_seg1
4035 0:
4036         FALIGN_D18
4037         stda    %d48, [%i0]ASI_BLK_P
4038         add     %i0, 64, %i0
4039         membar  #Sync
4040         FALIGN_D34
4041         stda    %d48, [%i0]ASI_BLK_P
4042         ba,pt   %ncc, copyin_blkd2
4043         add     %i0, 64, %i0
4044 
4045 1:
4046         FALIGN_D34
4047         stda    %d48, [%i0]ASI_BLK_P
4048         add     %i0, 64, %i0
4049         membar  #Sync
4050         FALIGN_D2
4051         stda    %d48, [%i0]ASI_BLK_P
4052         ba,pt   %ncc, copyin_blkd18
4053         add     %i0, 64, %i0
4054 
4055 2:
4056         FALIGN_D2
4057         stda    %d48, [%i0]ASI_BLK_P
4058         add     %i0, 64, %i0
4059         membar  #Sync
4060         FALIGN_D18
4061         stda    %d48, [%i0]ASI_BLK_P
4062         ba,pt   %ncc, copyin_blkd34
4063         add     %i0, 64, %i0
4064 copyin_seg2:
4065         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4066         FALIGN_D4
4067         ldda    [%l7]ASI_BLK_AIUS, %d0
4068         stda    %d48, [%i0]ASI_BLK_P
4069         add     %l7, 64, %l7
4070         subcc   %i3, 64, %i3
4071         bz,pn   %ncc, 0f
4072         add     %i0, 64, %i0
4073         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4074         FALIGN_D20
4075         ldda    [%l7]ASI_BLK_AIUS, %d16
4076         stda    %d48, [%i0]ASI_BLK_P
4077         add     %l7, 64, %l7
4078         subcc   %i3, 64, %i3
4079         bz,pn   %ncc, 1f
4080         add     %i0, 64, %i0
4081         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4082         FALIGN_D36
4083         ldda    [%l7]ASI_BLK_AIUS, %d32
4084         stda    %d48, [%i0]ASI_BLK_P
4085         add     %l7, 64, %l7
4086         subcc   %i3, 64, %i3
4087         bz,pn   %ncc, 2f
4088         add     %i0, 64, %i0
4089         ba,a,pt %ncc, copyin_seg2
4090 
4091 0:
4092         FALIGN_D20
4093         stda    %d48, [%i0]ASI_BLK_P
4094         add     %i0, 64, %i0
4095         membar  #Sync
4096         FALIGN_D36
4097         stda    %d48, [%i0]ASI_BLK_P
4098         ba,pt   %ncc, copyin_blkd4
4099         add     %i0, 64, %i0
4100 
4101 1:
4102         FALIGN_D36
4103         stda    %d48, [%i0]ASI_BLK_P
4104         add     %i0, 64, %i0
4105         membar  #Sync
4106         FALIGN_D4
4107         stda    %d48, [%i0]ASI_BLK_P
4108         ba,pt   %ncc, copyin_blkd20
4109         add     %i0, 64, %i0
4110 
4111 2:
4112         FALIGN_D4
4113         stda    %d48, [%i0]ASI_BLK_P
4114         add     %i0, 64, %i0
4115         membar  #Sync
4116         FALIGN_D20
4117         stda    %d48, [%i0]ASI_BLK_P
4118         ba,pt   %ncc, copyin_blkd36
4119         add     %i0, 64, %i0
4120 
4121 copyin_seg3:
4122         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4123         FALIGN_D6
4124         ldda    [%l7]ASI_BLK_AIUS, %d0
4125         stda    %d48, [%i0]ASI_BLK_P
4126         add     %l7, 64, %l7
4127         subcc   %i3, 64, %i3
4128         bz,pn   %ncc, 0f
4129         add     %i0, 64, %i0
4130         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4131         FALIGN_D22
4132         ldda    [%l7]ASI_BLK_AIUS, %d16
4133         stda    %d48, [%i0]ASI_BLK_P
4134         add     %l7, 64, %l7
4135         subcc   %i3, 64, %i3
4136         bz,pn   %ncc, 1f
4137         add     %i0, 64, %i0
4138         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4139         FALIGN_D38
4140         ldda    [%l7]ASI_BLK_AIUS, %d32
4141         stda    %d48, [%i0]ASI_BLK_P
4142         add     %l7, 64, %l7
4143         subcc   %i3, 64, %i3
4144         bz,pn   %ncc, 2f
4145         add     %i0, 64, %i0
4146         ba,a,pt %ncc, copyin_seg3
4147 
4148 0:
4149         FALIGN_D22
4150         stda    %d48, [%i0]ASI_BLK_P
4151         add     %i0, 64, %i0
4152         membar  #Sync
4153         FALIGN_D38
4154         stda    %d48, [%i0]ASI_BLK_P
4155         ba,pt   %ncc, copyin_blkd6
4156         add     %i0, 64, %i0
4157 
4158 1:
4159         FALIGN_D38
4160         stda    %d48, [%i0]ASI_BLK_P
4161         add     %i0, 64, %i0
4162         membar  #Sync
4163         FALIGN_D6
4164         stda    %d48, [%i0]ASI_BLK_P
4165         ba,pt   %ncc, copyin_blkd22
4166         add     %i0, 64, %i0
4167 
4168 2:
4169         FALIGN_D6
4170         stda    %d48, [%i0]ASI_BLK_P
4171         add     %i0, 64, %i0
4172         membar  #Sync
4173         FALIGN_D22
4174         stda    %d48, [%i0]ASI_BLK_P
4175         ba,pt   %ncc, copyin_blkd38
4176         add     %i0, 64, %i0
4177 
4178 copyin_seg4:
4179         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4180         FALIGN_D8
4181         ldda    [%l7]ASI_BLK_AIUS, %d0
4182         stda    %d48, [%i0]ASI_BLK_P
4183         add     %l7, 64, %l7
4184         subcc   %i3, 64, %i3
4185         bz,pn   %ncc, 0f
4186         add     %i0, 64, %i0
4187         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4188         FALIGN_D24
4189         ldda    [%l7]ASI_BLK_AIUS, %d16
4190         stda    %d48, [%i0]ASI_BLK_P
4191         add     %l7, 64, %l7
4192         subcc   %i3, 64, %i3
4193         bz,pn   %ncc, 1f
4194         add     %i0, 64, %i0
4195         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4196         FALIGN_D40
4197         ldda    [%l7]ASI_BLK_AIUS, %d32
4198         stda    %d48, [%i0]ASI_BLK_P
4199         add     %l7, 64, %l7
4200         subcc   %i3, 64, %i3
4201         bz,pn   %ncc, 2f
4202         add     %i0, 64, %i0
4203         ba,a,pt %ncc, copyin_seg4
4204 
4205 0:
4206         FALIGN_D24
4207         stda    %d48, [%i0]ASI_BLK_P
4208         add     %i0, 64, %i0
4209         membar  #Sync
4210         FALIGN_D40
4211         stda    %d48, [%i0]ASI_BLK_P
4212         ba,pt   %ncc, copyin_blkd8
4213         add     %i0, 64, %i0
4214 
4215 1:
4216         FALIGN_D40
4217         stda    %d48, [%i0]ASI_BLK_P
4218         add     %i0, 64, %i0
4219         membar  #Sync
4220         FALIGN_D8
4221         stda    %d48, [%i0]ASI_BLK_P
4222         ba,pt   %ncc, copyin_blkd24
4223         add     %i0, 64, %i0
4224 
4225 2:
4226         FALIGN_D8
4227         stda    %d48, [%i0]ASI_BLK_P
4228         add     %i0, 64, %i0
4229         membar  #Sync
4230         FALIGN_D24
4231         stda    %d48, [%i0]ASI_BLK_P
4232         ba,pt   %ncc, copyin_blkd40
4233         add     %i0, 64, %i0
4234 
4235 copyin_seg5:
4236         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4237         FALIGN_D10
4238         ldda    [%l7]ASI_BLK_AIUS, %d0
4239         stda    %d48, [%i0]ASI_BLK_P
4240         add     %l7, 64, %l7
4241         subcc   %i3, 64, %i3
4242         bz,pn   %ncc, 0f
4243         add     %i0, 64, %i0
4244         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4245         FALIGN_D26
4246         ldda    [%l7]ASI_BLK_AIUS, %d16
4247         stda    %d48, [%i0]ASI_BLK_P
4248         add     %l7, 64, %l7
4249         subcc   %i3, 64, %i3
4250         bz,pn   %ncc, 1f
4251         add     %i0, 64, %i0
4252         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4253         FALIGN_D42
4254         ldda    [%l7]ASI_BLK_AIUS, %d32
4255         stda    %d48, [%i0]ASI_BLK_P
4256         add     %l7, 64, %l7
4257         subcc   %i3, 64, %i3
4258         bz,pn   %ncc, 2f
4259         add     %i0, 64, %i0
4260         ba,a,pt %ncc, copyin_seg5
4261 
4262 0:
4263         FALIGN_D26
4264         stda    %d48, [%i0]ASI_BLK_P
4265         add     %i0, 64, %i0
4266         membar  #Sync
4267         FALIGN_D42
4268         stda    %d48, [%i0]ASI_BLK_P
4269         ba,pt   %ncc, copyin_blkd10
4270         add     %i0, 64, %i0
4271 
4272 1:
4273         FALIGN_D42
4274         stda    %d48, [%i0]ASI_BLK_P
4275         add     %i0, 64, %i0
4276         membar  #Sync
4277         FALIGN_D10
4278         stda    %d48, [%i0]ASI_BLK_P
4279         ba,pt   %ncc, copyin_blkd26
4280         add     %i0, 64, %i0
4281 
4282 2:
4283         FALIGN_D10
4284         stda    %d48, [%i0]ASI_BLK_P
4285         add     %i0, 64, %i0
4286         membar  #Sync
4287         FALIGN_D26
4288         stda    %d48, [%i0]ASI_BLK_P
4289         ba,pt   %ncc, copyin_blkd42
4290         add     %i0, 64, %i0
4291 
4292 copyin_seg6:
4293         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4294         FALIGN_D12
4295         ldda    [%l7]ASI_BLK_AIUS, %d0
4296         stda    %d48, [%i0]ASI_BLK_P
4297         add     %l7, 64, %l7
4298         subcc   %i3, 64, %i3
4299         bz,pn   %ncc, 0f
4300         add     %i0, 64, %i0
4301         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4302         FALIGN_D28
4303         ldda    [%l7]ASI_BLK_AIUS, %d16
4304         stda    %d48, [%i0]ASI_BLK_P
4305         add     %l7, 64, %l7
4306         subcc   %i3, 64, %i3
4307         bz,pn   %ncc, 1f
4308         add     %i0, 64, %i0
4309         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4310         FALIGN_D44
4311         ldda    [%l7]ASI_BLK_AIUS, %d32
4312         stda    %d48, [%i0]ASI_BLK_P
4313         add     %l7, 64, %l7
4314         subcc   %i3, 64, %i3
4315         bz,pn   %ncc, 2f
4316         add     %i0, 64, %i0
4317         ba,a,pt %ncc, copyin_seg6
4318 
4319 0:
4320         FALIGN_D28
4321         stda    %d48, [%i0]ASI_BLK_P
4322         add     %i0, 64, %i0
4323         membar  #Sync
4324         FALIGN_D44
4325         stda    %d48, [%i0]ASI_BLK_P
4326         ba,pt   %ncc, copyin_blkd12
4327         add     %i0, 64, %i0
4328 
4329 1:
4330         FALIGN_D44
4331         stda    %d48, [%i0]ASI_BLK_P
4332         add     %i0, 64, %i0
4333         membar  #Sync
4334         FALIGN_D12
4335         stda    %d48, [%i0]ASI_BLK_P
4336         ba,pt   %ncc, copyin_blkd28
4337         add     %i0, 64, %i0
4338 
4339 2:
4340         FALIGN_D12
4341         stda    %d48, [%i0]ASI_BLK_P
4342         add     %i0, 64, %i0
4343         membar  #Sync
4344         FALIGN_D28
4345         stda    %d48, [%i0]ASI_BLK_P
4346         ba,pt   %ncc, copyin_blkd44
4347         add     %i0, 64, %i0
4348 
4349 copyin_seg7:
4350         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4351         FALIGN_D14
4352         ldda    [%l7]ASI_BLK_AIUS, %d0
4353         stda    %d48, [%i0]ASI_BLK_P
4354         add     %l7, 64, %l7
4355         subcc   %i3, 64, %i3
4356         bz,pn   %ncc, 0f
4357         add     %i0, 64, %i0
4358         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4359         FALIGN_D30
4360         ldda    [%l7]ASI_BLK_AIUS, %d16
4361         stda    %d48, [%i0]ASI_BLK_P
4362         add     %l7, 64, %l7
4363         subcc   %i3, 64, %i3
4364         bz,pn   %ncc, 1f
4365         add     %i0, 64, %i0
4366         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4367         FALIGN_D46
4368         ldda    [%l7]ASI_BLK_AIUS, %d32
4369         stda    %d48, [%i0]ASI_BLK_P
4370         add     %l7, 64, %l7
4371         subcc   %i3, 64, %i3
4372         bz,pn   %ncc, 2f
4373         add     %i0, 64, %i0
4374         ba,a,pt %ncc, copyin_seg7
4375 
4376 0:
4377         FALIGN_D30
4378         stda    %d48, [%i0]ASI_BLK_P
4379         add     %i0, 64, %i0
4380         membar  #Sync
4381         FALIGN_D46
4382         stda    %d48, [%i0]ASI_BLK_P
4383         ba,pt   %ncc, copyin_blkd14
4384         add     %i0, 64, %i0
4385 
4386 1:
4387         FALIGN_D46
4388         stda    %d48, [%i0]ASI_BLK_P
4389         add     %i0, 64, %i0
4390         membar  #Sync
4391         FALIGN_D14
4392         stda    %d48, [%i0]ASI_BLK_P
4393         ba,pt   %ncc, copyin_blkd30
4394         add     %i0, 64, %i0
4395 
4396 2:
4397         FALIGN_D14
4398         stda    %d48, [%i0]ASI_BLK_P
4399         add     %i0, 64, %i0
4400         membar  #Sync
4401         FALIGN_D30
4402         stda    %d48, [%i0]ASI_BLK_P
4403         ba,pt   %ncc, copyin_blkd46
4404         add     %i0, 64, %i0
4405 
4406 
4407         !
4408         ! dribble out the last partial block
4409         !
4410 copyin_blkd0:
4411         subcc   %i4, 8, %i4
4412         blu,pn  %ncc, copyin_blkdone
4413         faligndata %d0, %d2, %d48
4414         std     %d48, [%i0]
4415         add     %i0, 8, %i0
4416 copyin_blkd2:
4417         subcc   %i4, 8, %i4
4418         blu,pn  %ncc, copyin_blkdone
4419         faligndata %d2, %d4, %d48
4420         std     %d48, [%i0]
4421         add     %i0, 8, %i0
4422 copyin_blkd4:
4423         subcc   %i4, 8, %i4
4424         blu,pn  %ncc, copyin_blkdone
4425         faligndata %d4, %d6, %d48
4426         std     %d48, [%i0]
4427         add     %i0, 8, %i0
4428 copyin_blkd6:
4429         subcc   %i4, 8, %i4
4430         blu,pn  %ncc, copyin_blkdone
4431         faligndata %d6, %d8, %d48
4432         std     %d48, [%i0]
4433         add     %i0, 8, %i0
4434 copyin_blkd8:
4435         subcc   %i4, 8, %i4
4436         blu,pn  %ncc, copyin_blkdone
4437         faligndata %d8, %d10, %d48
4438         std     %d48, [%i0]
4439         add     %i0, 8, %i0
4440 copyin_blkd10:
4441         subcc   %i4, 8, %i4
4442         blu,pn  %ncc, copyin_blkdone
4443         faligndata %d10, %d12, %d48
4444         std     %d48, [%i0]
4445         add     %i0, 8, %i0
4446 copyin_blkd12:
4447         subcc   %i4, 8, %i4
4448         blu,pn  %ncc, copyin_blkdone
4449         faligndata %d12, %d14, %d48
4450         std     %d48, [%i0]
4451         add     %i0, 8, %i0
4452 copyin_blkd14:
4453         subcc   %i4, 8, %i4
4454         blu,pn  %ncc, copyin_blkdone
4455         fsrc1   %d14, %d0
4456         ba,a,pt %ncc, copyin_blkleft
4457 
4458 copyin_blkd16:
4459         subcc   %i4, 8, %i4
4460         blu,pn  %ncc, copyin_blkdone
4461         faligndata %d16, %d18, %d48
4462         std     %d48, [%i0]
4463         add     %i0, 8, %i0
4464 copyin_blkd18:
4465         subcc   %i4, 8, %i4
4466         blu,pn  %ncc, copyin_blkdone
4467         faligndata %d18, %d20, %d48
4468         std     %d48, [%i0]
4469         add     %i0, 8, %i0
4470 copyin_blkd20:
4471         subcc   %i4, 8, %i4
4472         blu,pn  %ncc, copyin_blkdone
4473         faligndata %d20, %d22, %d48
4474         std     %d48, [%i0]
4475         add     %i0, 8, %i0
4476 copyin_blkd22:
4477         subcc   %i4, 8, %i4
4478         blu,pn  %ncc, copyin_blkdone
4479         faligndata %d22, %d24, %d48
4480         std     %d48, [%i0]
4481         add     %i0, 8, %i0
4482 copyin_blkd24:
4483         subcc   %i4, 8, %i4
4484         blu,pn  %ncc, copyin_blkdone
4485         faligndata %d24, %d26, %d48
4486         std     %d48, [%i0]
4487         add     %i0, 8, %i0
4488 copyin_blkd26:
4489         subcc   %i4, 8, %i4
4490         blu,pn  %ncc, copyin_blkdone
4491         faligndata %d26, %d28, %d48
4492         std     %d48, [%i0]
4493         add     %i0, 8, %i0
4494 copyin_blkd28:
4495         subcc   %i4, 8, %i4
4496         blu,pn  %ncc, copyin_blkdone
4497         faligndata %d28, %d30, %d48
4498         std     %d48, [%i0]
4499         add     %i0, 8, %i0
4500 copyin_blkd30:
4501         subcc   %i4, 8, %i4
4502         blu,pn  %ncc, copyin_blkdone
4503         fsrc1   %d30, %d0
4504         ba,a,pt %ncc, copyin_blkleft
4505 copyin_blkd32:
4506         subcc   %i4, 8, %i4
4507         blu,pn  %ncc, copyin_blkdone
4508         faligndata %d32, %d34, %d48
4509         std     %d48, [%i0]
4510         add     %i0, 8, %i0
4511 copyin_blkd34:
4512         subcc   %i4, 8, %i4
4513         blu,pn  %ncc, copyin_blkdone
4514         faligndata %d34, %d36, %d48
4515         std     %d48, [%i0]
4516         add     %i0, 8, %i0
4517 copyin_blkd36:
4518         subcc   %i4, 8, %i4
4519         blu,pn  %ncc, copyin_blkdone
4520         faligndata %d36, %d38, %d48
4521         std     %d48, [%i0]
4522         add     %i0, 8, %i0
4523 copyin_blkd38:
4524         subcc   %i4, 8, %i4
4525         blu,pn  %ncc, copyin_blkdone
4526         faligndata %d38, %d40, %d48
4527         std     %d48, [%i0]
4528         add     %i0, 8, %i0
4529 copyin_blkd40:
4530         subcc   %i4, 8, %i4
4531         blu,pn  %ncc, copyin_blkdone
4532         faligndata %d40, %d42, %d48
4533         std     %d48, [%i0]
4534         add     %i0, 8, %i0
4535 copyin_blkd42:
4536         subcc   %i4, 8, %i4
4537         blu,pn  %ncc, copyin_blkdone
4538         faligndata %d42, %d44, %d48
4539         std     %d48, [%i0]
4540         add     %i0, 8, %i0
4541 copyin_blkd44:
4542         subcc   %i4, 8, %i4
4543         blu,pn  %ncc, copyin_blkdone
4544         faligndata %d44, %d46, %d48
4545         std     %d48, [%i0]
4546         add     %i0, 8, %i0
4547 copyin_blkd46:
4548         subcc   %i4, 8, %i4
4549         blu,pn  %ncc, copyin_blkdone
4550         fsrc1   %d46, %d0
4551 
4552 copyin_blkleft:
4553 1:
4554         ldda    [%l7]ASI_USER, %d2
4555         add     %l7, 8, %l7
4556         subcc   %i4, 8, %i4
4557         faligndata %d0, %d2, %d8
4558         std     %d8, [%i0]
4559         blu,pn  %ncc, copyin_blkdone
4560         add     %i0, 8, %i0
4561         ldda    [%l7]ASI_USER, %d0
4562         add     %l7, 8, %l7
4563         subcc   %i4, 8, %i4
4564         faligndata %d2, %d0, %d8
4565         std     %d8, [%i0]
4566         bgeu,pt %ncc, 1b
4567         add     %i0, 8, %i0
4568 
4569 copyin_blkdone:
4570         tst     %i2
4571         bz,pt   %ncc, .copyin_exit
4572         and     %l3, 0x4, %l3           ! fprs.du = fprs.dl = 0
4573 
4574 7:      lduba   [%i1]ASI_USER, %i4
4575         inc     %i1
4576         inc     %i0
4577         deccc   %i2
4578         bgu     %ncc, 7b
4579           stb     %i4, [%i0 - 1]
4580 
4581 .copyin_exit:
4582         membar  #StoreLoad|#StoreStore
4583         btst    FPUSED_FLAG, SAVED_LOFAULT
4584         bz      %icc, 1f
4585           nop
4586 
4587         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
4588         wr      %o2, 0, %gsr
4589 
4590         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
4591         btst    FPRS_FEF, %o3
4592         bz      %icc, 4f
4593           nop
4594 
4595         ! restore fpregs from stack
4596         membar  #Sync
4597         add     %fp, STACK_BIAS - 257, %o2
4598         and     %o2, -64, %o2
4599         ldda    [%o2]ASI_BLK_P, %d0
4600         add     %o2, 64, %o2
4601         ldda    [%o2]ASI_BLK_P, %d16
4602         add     %o2, 64, %o2
4603         ldda    [%o2]ASI_BLK_P, %d32
4604         add     %o2, 64, %o2
4605         ldda    [%o2]ASI_BLK_P, %d48
4606         membar  #Sync
4607 
4608         ba,pt   %ncc, 1f
4609           wr    %o3, 0, %fprs           ! restore fprs
4610 
4611 4:
4612         FZERO                           ! zero all of the fpregs
4613         wr      %o3, 0, %fprs           ! restore fprs
4614 
4615 1:
4616         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4617         membar  #Sync                           ! sync error barrier
4618         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4619         ret
4620         restore %g0, 0, %o0
4621 .copyin_err:
4622         ldn     [THREAD_REG + T_COPYOPS], %o4
4623         brz     %o4, 2f
4624         nop
4625         ldn     [%o4 + CP_COPYIN], %g2
4626         jmp     %g2
4627         nop
4628 2:
4629         retl
4630         mov     -1, %o0
4631         SET_SIZE(copyin)
4632 
4633 #endif  /* lint */
4634 
4635 #ifdef  lint
4636 
4637 /*ARGSUSED*/
4638 int
4639 xcopyin(const void *uaddr, void *kaddr, size_t count)
4640 { return (0); }
4641 
4642 #else   /* lint */
4643 
     /*
      * xcopyin(uaddr, kaddr, count)
      *
      * Entry stub for the copy-in path: it loads the address of the local
      * fault handler, .xcopyin_err, into REAL_LOFAULT and then branches to
      * .do_copyin (defined outside this block), which performs the copy.
      * NOTE(review): REAL_LOFAULT is presumably the handler .do_copyin
      * installs for the duration of the copy - confirm at .do_copyin.
      *
      * .xcopyin_err:
      *      If the pointer at THREAD_REG + T_COPYOPS is non-zero, jump to
      *      the routine stored in its CP_XCOPYIN slot.  Otherwise return to
      *      the caller with %o0 = %g1.
      *      NOTE(review): %g1 presumably carries the error code left by the
      *      fault path - confirm against the code that dispatches here.
      */
4644         ENTRY(xcopyin)
4645         sethi   %hi(.xcopyin_err), REAL_LOFAULT
4646         b       .do_copyin
4647           or    REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT  ! (delay slot) finish handler address
4648 .xcopyin_err:
4649         ldn     [THREAD_REG + T_COPYOPS], %o4   ! %o4 = value at curthread + T_COPYOPS
4650         brz     %o4, 2f                         ! zero -> plain error return
4651         nop
4652         ldn     [%o4 + CP_XCOPYIN], %g2         ! %g2 = CP_XCOPYIN entry of that vector
4653         jmp     %g2                             ! jump (not call) to it; %o7 is untouched
4654         nop
4655 2:
4656         retl
4657         mov     %g1, %o0                        ! (delay slot) return value = %g1
4658         SET_SIZE(xcopyin)
4659 
4660 #endif  /* lint */
4661 
4662 #ifdef  lint
4663 
4664 /*ARGSUSED*/
4665 int
4666 xcopyin_little(const void *uaddr, void *kaddr, size_t count)
4667 { return (0); }
4668 
4669 #else   /* lint */
4670 
     /*
      * xcopyin_little(uaddr, kaddr, count)
      *
      * Byte-at-a-time copy from user space (loads go through ASI_AIUSL)
      * that reverses the byte order: kaddr[i] = uaddr[count - 1 - i] for
      * i = 0 .. count-1.
      *
      *      %o0 - uaddr on entry; biased source pointer inside the loop
      *      %o1 - kaddr on entry; kaddr + count inside the loop
      *      %o2 - count
      *      %o3 - negative index running from -count up to 0
      *      %o4 - scratch / byte being moved
      *      %o5 - t_lofault value found on entry, restored on every exit
      *
      * .little_err is installed in curthread->t_lofault for the duration
      * of the copy.  Returns 0 on success; the .little_err path returns
      * %g1.
      * NOTE(review): %g1 presumably holds the error code supplied by the
      * fault path - confirm against the code that dispatches to t_lofault.
      */
4671         ENTRY(xcopyin_little)
4672         sethi   %hi(.little_err), %o4
4673         ldn     [THREAD_REG + T_LOFAULT], %o5   ! remember the current t_lofault
4674         or      %o4, %lo(.little_err), %o4
4675         membar  #Sync                           ! sync error barrier
4676         stn     %o4, [THREAD_REG + T_LOFAULT]   ! t_lofault = .little_err
4677 
4678         subcc   %g0, %o2, %o3           ! %o3 = -count; Z set when count == 0
4679         add     %o0, %o2, %o0           ! %o0 = uaddr + count
4680         bz,pn   %ncc, 2f                ! check for zero bytes
4681         sub     %o2, 1, %o4             ! (delay slot) %o4 = count - 1
4682         add     %o0, %o4, %o0           ! start w/last byte     
4683         add     %o1, %o2, %o1           ! %o1 = kaddr + count
4684         lduba   [%o0+%o3]ASI_AIUSL, %o4 ! first load: uaddr + count - 1
4685 
     ! Each pass stores at kaddr + count + %o3 (ascending).  %o3 is bumped
     ! by 1 and %o0 dropped by 2, so the next load address %o0 + %o3 moves
     ! down by one byte.  inccc sets carry when %o3 wraps from -1 to 0,
     ! which ends the loop; the annulled delay-slot load only executes
     ! when the branch is taken.
4686 1:      stb     %o4, [%o1+%o3]
4687         inccc   %o3
4688         sub     %o0, 2, %o0             ! get next byte
4689         bcc,a,pt %ncc, 1b
4690           lduba [%o0+%o3]ASI_AIUSL, %o4
4691 
4692 2:      membar  #Sync                           ! sync error barrier
4693         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
4694         retl
4695         mov     %g0, %o0                ! return (0)
4696 
4697 .little_err:
4698         membar  #Sync                           ! sync error barrier
4699         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
4700         retl
4701         mov     %g1, %o0                ! (delay slot) return value = %g1
4702         SET_SIZE(xcopyin_little)
4703 
4704 #endif  /* lint */
4705 
4706 
4707 /*
4708  * Copy a block of storage - must not overlap (from + len <= to).
4709  * No fault handler installed (to be called under on_fault())
      *
      * copyin_noerr only loads the address of .copyio_noerr into
      * REAL_LOFAULT and branches to .do_copyin (defined outside this
      * block), which does the copy.
      *
      * .copyio_noerr jumps to the address held in SAVED_LOFAULT.
      * NOTE(review): SAVED_LOFAULT presumably holds the t_lofault value
      * that was current when the copy started (i.e. the caller's on_fault
      * handler) - confirm at .do_copyin.
      *
      * The .copyio_noerr label is also referenced by copyout_noerr.
4710  */
4711 #if defined(lint)
4712 
4713 /* ARGSUSED */
4714 void
4715 copyin_noerr(const void *ufrom, void *kto, size_t count)
4716 {}
4717 
4718 #else   /* lint */
4719 
4720         ENTRY(copyin_noerr)
4721         sethi   %hi(.copyio_noerr), REAL_LOFAULT
4722         b       .do_copyin
4723           or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT  ! (delay slot) finish handler address
4724 .copyio_noerr:
4725         jmp     SAVED_LOFAULT           ! continue at the address in SAVED_LOFAULT
4726           nop
4727         SET_SIZE(copyin_noerr)
4728 
4729 #endif /* lint */
4730 
4731 /*
4732  * Copy a block of storage - must not overlap (from + len <= to).
4733  * No fault handler installed (to be called under on_fault())
      *
      * copyout_noerr only loads the address of .copyio_noerr into
      * REAL_LOFAULT and branches to .do_copyout (defined outside this
      * block), which does the copy.  The .copyio_noerr label itself is
      * defined inside copyin_noerr.
4734  */
4735 
4736 #if defined(lint)
4737 
4738 /* ARGSUSED */
4739 void
4740 copyout_noerr(const void *kfrom, void *uto, size_t count)
4741 {}
4742 
4743 #else   /* lint */
4744 
4745         ENTRY(copyout_noerr)
4746         sethi   %hi(.copyio_noerr), REAL_LOFAULT
4747         b       .do_copyout
4748           or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT  ! (delay slot) finish handler address
4749         SET_SIZE(copyout_noerr)
4750 
4751 #endif /* lint */
4752 
     /*
      * Global tunables.  The lint branch declares them for C; the
      * assembler branch lays each one out as a single .word with the same
      * initial value (the use_hw_* flags start at 1, the hw_copy_limit_*
      * values start at 0).
      *
      * hwblkclr's header comment requires its caller to check
      * use_hw_bzero.
      * NOTE(review): the other flags and the hw_copy_limit_N values are
      * presumably consulted by copy routines elsewhere in this file (N
      * looks like an alignment in bytes) - confirm at their points of use.
      */
4753 #if defined(lint)
4754 
4755 int use_hw_bcopy = 1;
4756 int use_hw_copyio = 1;
4757 int use_hw_bzero = 1;
4758 uint_t hw_copy_limit_1 = 0;
4759 uint_t hw_copy_limit_2 = 0;
4760 uint_t hw_copy_limit_4 = 0;
4761 uint_t hw_copy_limit_8 = 0;
4762 
4763 #else /* !lint */
4764 
4765         .align  4
4766         DGDEF(use_hw_bcopy)
4767         .word   1
4768         DGDEF(use_hw_copyio)
4769         .word   1
4770         DGDEF(use_hw_bzero)
4771         .word   1
4772         DGDEF(hw_copy_limit_1)
4773         .word   0
4774         DGDEF(hw_copy_limit_2)
4775         .word   0
4776         DGDEF(hw_copy_limit_4)
4777         .word   0
4778         DGDEF(hw_copy_limit_8)
4779         .word   0
4780 
     ! Code follows: align to 64 bytes and select the .text section.
4781         .align  64
4782         .section ".text"
4783 #endif /* !lint */
4784 
4785 
4786 /*
4787  * hwblkclr - clears block-aligned, block-multiple-sized regions that are
4788  * at least 256 bytes in length using spitfire's block stores.  If
4789  * the criteria for using this routine are not met then it calls bzero
4790  * and returns 1.  Otherwise 0 is returned indicating success.
4791  * Caller is responsible for ensuring use_hw_bzero is true and that
4792  * kpreempt_disable() has been called.
      *
      * "Block" here means 64 bytes: the address must be 64-byte aligned and
      * the length a multiple of 64.
      *
      * If FPRS_FEF is set in %fprs on entry, the 64-byte block starting at
      * %d0 is saved to the stack first and reloaded before returning; %fprs
      * is restored to its entry value in either case.  %asi is set to
      * ASI_BLK_P and is not restored by this routine.
4793  */
4794 #ifdef lint
4795 /*ARGSUSED*/
4796 int
4797 hwblkclr(void *addr, size_t len)
4798 { 
4799         return(0);
4800 }
4801 #else /* lint */
4802         ! %i0 - start address
4803         ! %i1 - length of region (multiple of 64)
4804         ! %l0 - saved fprs
4805         ! %l1 - pointer to saved %d0 block
4806         ! %l2 - listed historically as saved curthread->t_lwp; not used below
             ! %i2 - byte offset back from .pz_zinst for the tail jump
             ! %i3 - bytes cleared by the current pass (256, or the tail size)
             ! %i4 - computed jump target for the tail
4807 
4808         ENTRY(hwblkclr)
4809         ! get another window w/space for one aligned block of saved fpregs
4810         save    %sp, -SA(MINFRAME + 2*64), %sp
4811 
4812         ! Must be block-aligned
4813         andcc   %i0, (64-1), %g0
4814         bnz,pn  %ncc, 1f
4815           nop
4816 
4817         ! ... and must be 256 bytes or more
4818         cmp     %i1, 256
4819         blu,pn  %ncc, 1f
4820           nop
4821 
4822         ! ... and length must be a multiple of 64
4823         andcc   %i1, (64-1), %g0
4824         bz,pn   %ncc, 2f
4825           nop
4826 
4827 1:      ! punt, call bzero but notify the caller that bzero was used
4828         mov     %i0, %o0
4829         call    bzero
4830           mov   %i1, %o1
4831         ret
4832         restore %g0, 1, %o0     ! return (1) - did not use block operations
4833 
4834 2:      rd      %fprs, %l0              ! check for unused fp
4835         btst    FPRS_FEF, %l0
4836         bz      1f                      ! FEF clear: nothing to save
4837           nop
4838 
4839         ! save in-use fpregs on stack
4840         membar  #Sync
4841         add     %fp, STACK_BIAS - 65, %l1
4842         and     %l1, -64, %l1           ! round down to a 64-byte boundary
4843         stda    %d0, [%l1]ASI_BLK_P
4844 
4845 1:      membar  #StoreStore|#StoreLoad|#LoadStore
4846         wr      %g0, FPRS_FEF, %fprs    ! enable the FPU for the fzero/stda below
4847         wr      %g0, ASI_BLK_P, %asi    ! the "%asi" stores below use ASI_BLK_P
4848 
4849         ! Clear block
4850         fzero   %d0
4851         fzero   %d2
4852         fzero   %d4
4853         fzero   %d6
4854         fzero   %d8
4855         fzero   %d10
4856         fzero   %d12
4857         fzero   %d14
4858 
4859         mov     256, %i3                ! main loop advances 256 bytes per pass
4860         ba      .pz_doblock
4861           nop
4862 
             ! Main loop body: four 64-byte block stores per pass, highest
             ! offset first.  The three stda instructions below are also the
             ! landing area for the computed jump that clears the tail.
4863 .pz_blkstart:   
4864       ! stda    %d0, [%i0+192]%asi  ! in dly slot of branch that got us here
4865         stda    %d0, [%i0+128]%asi
4866         stda    %d0, [%i0+64]%asi
4867         stda    %d0, [%i0]%asi
4868 .pz_zinst:
4869         add     %i0, %i3, %i0           ! advance the address by the bytes just cleared
4870         sub     %i1, %i3, %i1           ! and shrink the remaining length
4871 .pz_doblock:
4872         cmp     %i1, 256
4873         bgeu,a  %ncc, .pz_blkstart
4874           stda  %d0, [%i0+192]%asi      ! annulled: executes only when the branch is taken
4875 
4876         cmp     %i1, 64
4877         blu     %ncc, .pz_finish
4878         
             ! The andn below sits in the delay slot of the blu above, so it
             ! also executes when the branch is taken; %i3 is not read on the
             ! .pz_finish path.
             !
             ! Tail of 64, 128 or 192 bytes: %i3 = tail size, %i2 = %i3 / 16,
             ! i.e. 4 bytes (one instruction) per 64-byte block.  Jumping to
             ! .pz_zinst - %i2 runs exactly the last %i3/64 stda instructions
             ! above and then falls into .pz_zinst with %i3 as the step.
4879         andn    %i1, (64-1), %i3
4880         srl     %i3, 4, %i2             ! using blocks, 1 instr / 16 words
4881         set     .pz_zinst, %i4
4882         sub     %i4, %i2, %i4
4883         jmp     %i4
4884           nop
4885 
4886 .pz_finish:
4887         membar  #Sync
4888         btst    FPRS_FEF, %l0
4889         bz,a    .pz_finished            ! FEF was clear on entry: nothing to reload
4890           wr    %l0, 0, %fprs           ! restore fprs
4891 
4892         ! restore fpregs from stack
4893         ldda    [%l1]ASI_BLK_P, %d0
4894         membar  #Sync
4895         wr      %l0, 0, %fprs           ! restore fprs
4896 
4897 .pz_finished:
4898         ret
4899         restore %g0, 0, %o0             ! return (0) - block operations were used
4900         SET_SIZE(hwblkclr)
4901 #endif  /* lint */
4902 
4903 #ifdef  lint
4904 /* Copy 32 bytes of data from src to dst using physical addresses */
4905 /*ARGSUSED*/
4906 void
4907 hw_pa_bcopy32(uint64_t src, uint64_t dst)
4908 {}
4909 #else   /*!lint */
4910 
4911         /*
4912          * Copy 32 bytes of data from src (%o0) to dst (%o1)
4913          * using physical addresses.
              *
              * The copy is done as four 8-byte ASI_MEM loads followed by
              * four 8-byte ASI_MEM stores, so all of the source is read
              * before any of the destination is written.  PSTATE_IE is
              * cleared for the duration and the entry %pstate is written
              * back in the delay slot of the return.
              *
              * Registers written: %o0-%o5, %g1, %g2.
4914          */
4915         ENTRY_NP(hw_pa_bcopy32)
4916         rdpr    %pstate, %g1            ! %g1 = %pstate on entry
4917         andn    %g1, PSTATE_IE, %g2
4918         wrpr    %g0, %g2, %pstate       ! same state with PSTATE_IE cleared
4919 
4920         ldxa    [%o0]ASI_MEM, %o2       ! src + 0
4921         add     %o0, 8, %o0
4922         ldxa    [%o0]ASI_MEM, %o3       ! src + 8
4923         add     %o0, 8, %o0
4924         ldxa    [%o0]ASI_MEM, %o4       ! src + 16
4925         add     %o0, 8, %o0
4926         ldxa    [%o0]ASI_MEM, %o5       ! src + 24
4927         stxa    %o2, [%o1]ASI_MEM       ! dst + 0
4928         add     %o1, 8, %o1
4929         stxa    %o3, [%o1]ASI_MEM       ! dst + 8
4930         add     %o1, 8, %o1
4931         stxa    %o4, [%o1]ASI_MEM       ! dst + 16
4932         add     %o1, 8, %o1
4933         stxa    %o5, [%o1]ASI_MEM       ! dst + 24
4934 
4935         membar  #Sync
4936         retl
4937           wrpr    %g0, %g1, %pstate     ! (delay slot) put back the entry %pstate
4938         SET_SIZE(hw_pa_bcopy32)
4939 #endif /* lint */