1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/param.h>
  28 #include <sys/errno.h>
  29 #include <sys/asm_linkage.h>
  30 #include <sys/vtrace.h>
  31 #include <sys/machthread.h>
  32 #include <sys/clock.h>
  33 #include <sys/asi.h>
  34 #include <sys/fsr.h>
  35 #include <sys/privregs.h>
  36 
  37 #include "assym.h"
  38 
  39 
  40 /*
  41  * Pseudo-code to aid in understanding the control flow of the
  42  * bcopy routine.
  43  *
  44  * On entry to bcopy:
  45  *
  46  *      %l6 = curthread->t_lofault;
  47  *      used_block_copy = FALSE;                        ! %l6 &= ~1
  48  *      if (%l6 != NULL) {
  49  *              curthread->t_lofault = .copyerr;
  50  *              caller_error_handler = TRUE             ! %l6 |= 2
  51  *      }
  52  *
  53  *      if (length < VIS_COPY)
  54  *              goto regular_copy;
  55  *
  56  *      if (!use_vis)
  57  *              goto regular_copy;
  58  *
  59  *      if (curthread->t_lwp == NULL) {
  60  *              ! Kernel threads do not have pcb's in which to store
  61  *              ! the floating point state, disallow preemption during
  62  *              ! the copy.
  63  *              kpreempt_disable(curthread);
  64  *      }
  65  *
  66  *      old_fprs = %fprs;
  67  *      old_gsr = %gsr;
  68  *      if (%fprs.fef) {
  69  *              ! If we need to save 4 blocks of fpregs then make sure
  70  *              ! the length is still appropriate for that extra overhead.
  71  *              if (length < (large_length + (64 * 4))) {
  72  *                      if (curthread->t_lwp == NULL)
  73  *                              kpreempt_enable(curthread);
  74  *                      goto regular_copy;
  75  *              }
  76  *              %fprs.fef = 1;
  77  *              save current fpregs on stack using blockstore
  78  *      } else {
  79  *              %fprs.fef = 1;
  80  *      }
  81  *
  82  *      used_block_copy = 1;                            ! %l6 |= 1
  83  *      do_blockcopy_here;
  84  *
  85  * In lofault handler:
  86  *      curthread->t_lofault = .copyerr2;
  87  *      Continue on with the normal exit handler
  88  *
  89  * On exit:
  90  *      call_kpreempt = 0;
  91  *      if (used_block_copy) {                          ! %l6 & 1
  92  *              %gsr = old_gsr;
  93  *              if (old_fprs & FPRS_FEF)
  94  *                      restore fpregs from stack using blockload
  95  *              else
  96  *                      zero fpregs
  97  *              %fprs = old_fprs;
  98  *              if (curthread->t_lwp == NULL) {
  99  *                      kpreempt_enable(curthread);
 100  *                      call_kpreempt = 1;
 101  *              }
 102  *      }
 103  *      curthread->t_lofault = (%l6 & ~3);
 104  *      if (call_kpreempt)
 105  *              kpreempt(%pil);
 106  *      return (0)
 107  *
 108  * In second lofault handler (.copyerr2):
 109  *      We've tried to restore fp state from the stack and failed.  To
 110  *      avoid returning with a corrupted fp state, we panic.
 111  */
 112 
 113 /*
 114  * Notes on preserving existing fp state:
 115  *
 116  * When a copyOP decides to use fp we may have to preserve existing
 117  * floating point state.  It is not the caller's state that we need to
 118  * preserve - the rest of the kernel does not use fp and, anyway, fp
 119  * registers are volatile across a call.  Some examples:
 120  *
 121  *      - userland has fp state and is interrupted (device interrupt
 122  *        or trap) and within the interrupt/trap handling we use
 123  *        bcopy()
 124  *      - another (higher level) interrupt or trap handler uses bcopy
 125  *        while a bcopy from an earlier interrupt is still active
 126  *      - an asynchronous error trap occurs while fp state exists (in
 127  *        userland or in kernel copy) and the tl0 component of the handling
 128  *        uses bcopy
 129  *      - a user process with fp state incurs a copy-on-write fault and
 130  *        hwblkpagecopy always uses fp
 131  *
 132  * We therefore need a per-call place in which to preserve fp state -
 133  * using our stack is ideal (and since fp copy cannot be leaf optimized
 134  * because of calls it makes, this is no hardship).
 135  *
 136  * To make sure that floating point state is always saved and restored
 137  * correctly, the following "big rules" must be followed when the floating
 138  * point registers will be used:
 139  *
 140  * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 141  *    the bit with value 1 (FPUSED_FLAG) indicates that the floating point
 142  *    registers are in use; the bit with value 2 (BCOPY_FLAG) marks a bcopy call.
 143  *
 144  * 2. The FPUSED flag indicates that all FP state has been successfully stored
 145  *    on the stack.  It should not be set until this save has been completed.
 146  *
 147  * 3. The FPUSED flag should not be cleared on exit until all FP state has
 148  *    been restored from the stack.  If an error occurs while restoring
 149  *    data from the stack, the error handler can check this flag to see if
 150  *    a restore is necessary.
 151  *
 152  * 4. Code run under the new lofault handler must be kept to a minimum.  In
 153  *    particular, any calls to kpreempt() should not be made until after the
 154  *    lofault handler has been restored.
 155  */
 156 
 157 /*
 158  * This shadows sys/machsystm.h which can't be included due to the lack of
 159  * _ASM guards in include files it references. Change it here, change it there.
      *
      * Copy length, in bytes, below which the copy code does not attempt the
      * VIS block-copy path.  When the fp registers are already live the
      * threshold is raised by (64 * 4) to pay for saving them.
 160  */
 161 #define VIS_COPY_THRESHOLD 900
 162 
 163 /*
 164  * Less than or equal to this number of bytes we will always copy byte-for-byte
 165  */
 166 #define SMALL_LIMIT     7
 167 
 168 /*
 169  * Flags set in the lower bits of the t_lofault address:
 170  * FPUSED_FLAG: The FP registers were in use and must be restored
 171  * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
 172  * COPY_FLAGS: Both of the above
 173  *
 174  * Other flags:
 175  * KPREEMPT_FLAG: kpreempt needs to be called
      *
      * FPUSED_FLAG and BCOPY_FLAG travel in %l6 together with the saved
      * t_lofault value and are masked off (COPY_FLAGS) before that value is
      * written back.  KPREEMPT_FLAG is only ever kept in a scratch register
      * on the exit path, never in the saved t_lofault value.
 176  */
 177 #define FPUSED_FLAG     1
 178 #define BCOPY_FLAG      2
 179 #define COPY_FLAGS      (FPUSED_FLAG | BCOPY_FLAG)
 180 #define KPREEMPT_FLAG   4
 181 
 182 /*
 183  * Size of stack frame in order to accommodate a 64-byte aligned
 184  * floating-point register save area and 2 32-bit temp locations.
      *
      * The save area holds four 64-byte blocks (256 bytes); the fifth 64 bytes
      * are slack so that the start can be rounded down to a 64-byte boundary.
 185  */
 186 #define HWCOPYFRAMESIZE ((64 * 5) + (2 * 4))
 187 
      /*
       * Offsets of the saved state, subtracted from (%fp + STACK_BIAS):
       * the fp register save area occupies the first (64 * 5) bytes below
       * the frame pointer, followed by the 32-bit saved %fprs and then the
       * 32-bit saved %gsr.
       */
 188 #define SAVED_FPREGS_OFFSET     (64 * 5)
 189 #define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 4)
 190 #define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 4)
 191 
 192 /*
 193  * Common macros used by the various versions of the block copy
 194  * routines in this file.
 195  */
 196 
      /*
       * FZERO: clear every double-precision fp register, %f0 through %f62.
       *
       * %f0 and %f2 are zeroed directly with fzero; each remaining register is
       * then written with %f0 + %f2 or %f0 * %f2, both of which are zero.
       * NOTE(review): the add/multiply alternation is presumably there to keep
       * both fp pipelines busy - confirm before simplifying it.
       *
       * Used on the exit path when the fp registers were not enabled on entry,
       * i.e. there is no saved state to reload.  The fp unit must be enabled
       * (FPRS_FEF set) while the macro runs.
       */
 197 #define FZERO                           \
 198         fzero   %f0                     ;\
 199         fzero   %f2                     ;\
 200         faddd   %f0, %f2, %f4           ;\
 201         fmuld   %f0, %f2, %f6           ;\
 202         faddd   %f0, %f2, %f8           ;\
 203         fmuld   %f0, %f2, %f10          ;\
 204         faddd   %f0, %f2, %f12          ;\
 205         fmuld   %f0, %f2, %f14          ;\
 206         faddd   %f0, %f2, %f16          ;\
 207         fmuld   %f0, %f2, %f18          ;\
 208         faddd   %f0, %f2, %f20          ;\
 209         fmuld   %f0, %f2, %f22          ;\
 210         faddd   %f0, %f2, %f24          ;\
 211         fmuld   %f0, %f2, %f26          ;\
 212         faddd   %f0, %f2, %f28          ;\
 213         fmuld   %f0, %f2, %f30          ;\
 214         faddd   %f0, %f2, %f32          ;\
 215         fmuld   %f0, %f2, %f34          ;\
 216         faddd   %f0, %f2, %f36          ;\
 217         fmuld   %f0, %f2, %f38          ;\
 218         faddd   %f0, %f2, %f40          ;\
 219         fmuld   %f0, %f2, %f42          ;\
 220         faddd   %f0, %f2, %f44          ;\
 221         fmuld   %f0, %f2, %f46          ;\
 222         faddd   %f0, %f2, %f48          ;\
 223         fmuld   %f0, %f2, %f50          ;\
 224         faddd   %f0, %f2, %f52          ;\
 225         fmuld   %f0, %f2, %f54          ;\
 226         faddd   %f0, %f2, %f56          ;\
 227         fmuld   %f0, %f2, %f58          ;\
 228         faddd   %f0, %f2, %f60          ;\
 229         fmuld   %f0, %f2, %f62
 230 
 231 
      /*
       * FALIGN_Dn: produce one realigned 64-byte block in %d48-%d62.
       *
       * %d0-%d46 are used as a ring of 24 double registers, i.e. three
       * 64-byte blocks of source data loaded at %d0, %d16 and %d32.  Each
       * macro issues eight faligndata instructions.  The first combines %dn
       * with the next double in the ring and writes %d48, the second starts
       * one double later and writes %d50, and so on up to %d62.  The ring
       * wraps from %d46 back to %d0.  The macro name gives the first source
       * register.
       *
       * faligndata takes its byte offset from %gsr, so the caller must have
       * executed alignaddr on the source address beforehand.  Each macro
       * overwrites all of %d48-%d62 and writes no other register.
       *
       * The macros are listed in groups of three (Dn, Dn+16, Dn+32): one for
       * each position of the current source block within the ring, for a
       * given starting double n/2 within the block.
       */
 232 #define FALIGN_D0                       \
 233         faligndata %d0, %d2, %d48       ;\
 234         faligndata %d2, %d4, %d50       ;\
 235         faligndata %d4, %d6, %d52       ;\
 236         faligndata %d6, %d8, %d54       ;\
 237         faligndata %d8, %d10, %d56      ;\
 238         faligndata %d10, %d12, %d58     ;\
 239         faligndata %d12, %d14, %d60     ;\
 240         faligndata %d14, %d16, %d62
 241 
 242 #define FALIGN_D16                      \
 243         faligndata %d16, %d18, %d48     ;\
 244         faligndata %d18, %d20, %d50     ;\
 245         faligndata %d20, %d22, %d52     ;\
 246         faligndata %d22, %d24, %d54     ;\
 247         faligndata %d24, %d26, %d56     ;\
 248         faligndata %d26, %d28, %d58     ;\
 249         faligndata %d28, %d30, %d60     ;\
 250         faligndata %d30, %d32, %d62
 251 
 252 #define FALIGN_D32                      \
 253         faligndata %d32, %d34, %d48     ;\
 254         faligndata %d34, %d36, %d50     ;\
 255         faligndata %d36, %d38, %d52     ;\
 256         faligndata %d38, %d40, %d54     ;\
 257         faligndata %d40, %d42, %d56     ;\
 258         faligndata %d42, %d44, %d58     ;\
 259         faligndata %d44, %d46, %d60     ;\
 260         faligndata %d46, %d0, %d62
 261 
 262 #define FALIGN_D2                       \
 263         faligndata %d2, %d4, %d48       ;\
 264         faligndata %d4, %d6, %d50       ;\
 265         faligndata %d6, %d8, %d52       ;\
 266         faligndata %d8, %d10, %d54      ;\
 267         faligndata %d10, %d12, %d56     ;\
 268         faligndata %d12, %d14, %d58     ;\
 269         faligndata %d14, %d16, %d60     ;\
 270         faligndata %d16, %d18, %d62
 271 
 272 #define FALIGN_D18                      \
 273         faligndata %d18, %d20, %d48     ;\
 274         faligndata %d20, %d22, %d50     ;\
 275         faligndata %d22, %d24, %d52     ;\
 276         faligndata %d24, %d26, %d54     ;\
 277         faligndata %d26, %d28, %d56     ;\
 278         faligndata %d28, %d30, %d58     ;\
 279         faligndata %d30, %d32, %d60     ;\
 280         faligndata %d32, %d34, %d62
 281 
 282 #define FALIGN_D34                      \
 283         faligndata %d34, %d36, %d48     ;\
 284         faligndata %d36, %d38, %d50     ;\
 285         faligndata %d38, %d40, %d52     ;\
 286         faligndata %d40, %d42, %d54     ;\
 287         faligndata %d42, %d44, %d56     ;\
 288         faligndata %d44, %d46, %d58     ;\
 289         faligndata %d46, %d0, %d60      ;\
 290         faligndata %d0, %d2, %d62
 291 
 292 #define FALIGN_D4                       \
 293         faligndata %d4, %d6, %d48       ;\
 294         faligndata %d6, %d8, %d50       ;\
 295         faligndata %d8, %d10, %d52      ;\
 296         faligndata %d10, %d12, %d54     ;\
 297         faligndata %d12, %d14, %d56     ;\
 298         faligndata %d14, %d16, %d58     ;\
 299         faligndata %d16, %d18, %d60     ;\
 300         faligndata %d18, %d20, %d62
 301 
 302 #define FALIGN_D20                      \
 303         faligndata %d20, %d22, %d48     ;\
 304         faligndata %d22, %d24, %d50     ;\
 305         faligndata %d24, %d26, %d52     ;\
 306         faligndata %d26, %d28, %d54     ;\
 307         faligndata %d28, %d30, %d56     ;\
 308         faligndata %d30, %d32, %d58     ;\
 309         faligndata %d32, %d34, %d60     ;\
 310         faligndata %d34, %d36, %d62
 311 
 312 #define FALIGN_D36                      \
 313         faligndata %d36, %d38, %d48     ;\
 314         faligndata %d38, %d40, %d50     ;\
 315         faligndata %d40, %d42, %d52     ;\
 316         faligndata %d42, %d44, %d54     ;\
 317         faligndata %d44, %d46, %d56     ;\
 318         faligndata %d46, %d0, %d58      ;\
 319         faligndata %d0, %d2, %d60       ;\
 320         faligndata %d2, %d4, %d62
 321 
 322 #define FALIGN_D6                       \
 323         faligndata %d6, %d8, %d48       ;\
 324         faligndata %d8, %d10, %d50      ;\
 325         faligndata %d10, %d12, %d52     ;\
 326         faligndata %d12, %d14, %d54     ;\
 327         faligndata %d14, %d16, %d56     ;\
 328         faligndata %d16, %d18, %d58     ;\
 329         faligndata %d18, %d20, %d60     ;\
 330         faligndata %d20, %d22, %d62
 331 
 332 #define FALIGN_D22                      \
 333         faligndata %d22, %d24, %d48     ;\
 334         faligndata %d24, %d26, %d50     ;\
 335         faligndata %d26, %d28, %d52     ;\
 336         faligndata %d28, %d30, %d54     ;\
 337         faligndata %d30, %d32, %d56     ;\
 338         faligndata %d32, %d34, %d58     ;\
 339         faligndata %d34, %d36, %d60     ;\
 340         faligndata %d36, %d38, %d62
 341 
 342 #define FALIGN_D38                      \
 343         faligndata %d38, %d40, %d48     ;\
 344         faligndata %d40, %d42, %d50     ;\
 345         faligndata %d42, %d44, %d52     ;\
 346         faligndata %d44, %d46, %d54     ;\
 347         faligndata %d46, %d0, %d56      ;\
 348         faligndata %d0, %d2, %d58       ;\
 349         faligndata %d2, %d4, %d60       ;\
 350         faligndata %d4, %d6, %d62
 351 
 352 #define FALIGN_D8                       \
 353         faligndata %d8, %d10, %d48      ;\
 354         faligndata %d10, %d12, %d50     ;\
 355         faligndata %d12, %d14, %d52     ;\
 356         faligndata %d14, %d16, %d54     ;\
 357         faligndata %d16, %d18, %d56     ;\
 358         faligndata %d18, %d20, %d58     ;\
 359         faligndata %d20, %d22, %d60     ;\
 360         faligndata %d22, %d24, %d62
 361 
 362 #define FALIGN_D24                      \
 363         faligndata %d24, %d26, %d48     ;\
 364         faligndata %d26, %d28, %d50     ;\
 365         faligndata %d28, %d30, %d52     ;\
 366         faligndata %d30, %d32, %d54     ;\
 367         faligndata %d32, %d34, %d56     ;\
 368         faligndata %d34, %d36, %d58     ;\
 369         faligndata %d36, %d38, %d60     ;\
 370         faligndata %d38, %d40, %d62
 371 
 372 #define FALIGN_D40                      \
 373         faligndata %d40, %d42, %d48     ;\
 374         faligndata %d42, %d44, %d50     ;\
 375         faligndata %d44, %d46, %d52     ;\
 376         faligndata %d46, %d0, %d54      ;\
 377         faligndata %d0, %d2, %d56       ;\
 378         faligndata %d2, %d4, %d58       ;\
 379         faligndata %d4, %d6, %d60       ;\
 380         faligndata %d6, %d8, %d62
 381 
 382 #define FALIGN_D10                      \
 383         faligndata %d10, %d12, %d48     ;\
 384         faligndata %d12, %d14, %d50     ;\
 385         faligndata %d14, %d16, %d52     ;\
 386         faligndata %d16, %d18, %d54     ;\
 387         faligndata %d18, %d20, %d56     ;\
 388         faligndata %d20, %d22, %d58     ;\
 389         faligndata %d22, %d24, %d60     ;\
 390         faligndata %d24, %d26, %d62
 391 
 392 #define FALIGN_D26                      \
 393         faligndata %d26, %d28, %d48     ;\
 394         faligndata %d28, %d30, %d50     ;\
 395         faligndata %d30, %d32, %d52     ;\
 396         faligndata %d32, %d34, %d54     ;\
 397         faligndata %d34, %d36, %d56     ;\
 398         faligndata %d36, %d38, %d58     ;\
 399         faligndata %d38, %d40, %d60     ;\
 400         faligndata %d40, %d42, %d62
 401 
 402 #define FALIGN_D42                      \
 403         faligndata %d42, %d44, %d48     ;\
 404         faligndata %d44, %d46, %d50     ;\
 405         faligndata %d46, %d0, %d52      ;\
 406         faligndata %d0, %d2, %d54       ;\
 407         faligndata %d2, %d4, %d56       ;\
 408         faligndata %d4, %d6, %d58       ;\
 409         faligndata %d6, %d8, %d60       ;\
 410         faligndata %d8, %d10, %d62
 411 
 412 #define FALIGN_D12                      \
 413         faligndata %d12, %d14, %d48     ;\
 414         faligndata %d14, %d16, %d50     ;\
 415         faligndata %d16, %d18, %d52     ;\
 416         faligndata %d18, %d20, %d54     ;\
 417         faligndata %d20, %d22, %d56     ;\
 418         faligndata %d22, %d24, %d58     ;\
 419         faligndata %d24, %d26, %d60     ;\
 420         faligndata %d26, %d28, %d62
 421 
 422 #define FALIGN_D28                      \
 423         faligndata %d28, %d30, %d48     ;\
 424         faligndata %d30, %d32, %d50     ;\
 425         faligndata %d32, %d34, %d52     ;\
 426         faligndata %d34, %d36, %d54     ;\
 427         faligndata %d36, %d38, %d56     ;\
 428         faligndata %d38, %d40, %d58     ;\
 429         faligndata %d40, %d42, %d60     ;\
 430         faligndata %d42, %d44, %d62
 431 
 432 #define FALIGN_D44                      \
 433         faligndata %d44, %d46, %d48     ;\
 434         faligndata %d46, %d0, %d50      ;\
 435         faligndata %d0, %d2, %d52       ;\
 436         faligndata %d2, %d4, %d54       ;\
 437         faligndata %d4, %d6, %d56       ;\
 438         faligndata %d6, %d8, %d58       ;\
 439         faligndata %d8, %d10, %d60      ;\
 440         faligndata %d10, %d12, %d62
 441 
 442 #define FALIGN_D14                      \
 443         faligndata %d14, %d16, %d48     ;\
 444         faligndata %d16, %d18, %d50     ;\
 445         faligndata %d18, %d20, %d52     ;\
 446         faligndata %d20, %d22, %d54     ;\
 447         faligndata %d22, %d24, %d56     ;\
 448         faligndata %d24, %d26, %d58     ;\
 449         faligndata %d26, %d28, %d60     ;\
 450         faligndata %d28, %d30, %d62
 451 
 452 #define FALIGN_D30                      \
 453         faligndata %d30, %d32, %d48     ;\
 454         faligndata %d32, %d34, %d50     ;\
 455         faligndata %d34, %d36, %d52     ;\
 456         faligndata %d36, %d38, %d54     ;\
 457         faligndata %d38, %d40, %d56     ;\
 458         faligndata %d40, %d42, %d58     ;\
 459         faligndata %d42, %d44, %d60     ;\
 460         faligndata %d44, %d46, %d62
 461 
 462 #define FALIGN_D46                      \
 463         faligndata %d46, %d0, %d48      ;\
 464         faligndata %d0, %d2, %d50       ;\
 465         faligndata %d2, %d4, %d52       ;\
 466         faligndata %d4, %d6, %d54       ;\
 467         faligndata %d6, %d8, %d56       ;\
 468         faligndata %d8, %d10, %d58      ;\
 469         faligndata %d10, %d12, %d60     ;\
 470         faligndata %d12, %d14, %d62
 471 
 472 
 473 /*
 474  * Copy a block of storage, returning an error code if `from' or
 475  * `to' takes a kernel pagefault which cannot be resolved.
 476  * Returns errno value on pagefault error, 0 if all ok
      *
      * kcopy installs .copyerr as the thread's t_lofault handler and then
      * branches to .do_copy, the copy code it shares with bcopy.  The
      * previous t_lofault value is carried in %l6 without BCOPY_FLAG, which
      * is how .copyerr later tells a kcopy fault (return the errno in %g1)
      * from a bcopy fault (jump to the previously installed handler).
 477  */
 478 
 479 
 480 
 481         .seg    ".text"
 482         .align  4
 483 
 484         ENTRY(kcopy)
 485 
              ! New register window.  The frame is sized by HWCOPYFRAMESIZE so
              ! that the saved %gsr, %fprs and fp registers can be found at
              ! fixed offsets below %fp.
 486         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
 487         set     .copyerr, %l6           ! copyerr is lofault value
 488         ldn     [THREAD_REG + T_LOFAULT], %l7   ! save existing handler
 489         membar  #Sync                   ! sync error barrier (see copy.s)
 490         stn     %l6, [THREAD_REG + T_LOFAULT]   ! set t_lofault
 491         !
 492         ! Note that we carefully do *not* flag the setting of
 493         ! t_lofault.
 494         !
 495         ba,pt   %ncc, .do_copy          ! common code
 496           mov   %l7, %l6                ! delay slot: %l6 = previous t_lofault
 497 
 498 /*
 499  * We got here because of a fault during kcopy or bcopy if a fault
 500  * handler existed when bcopy was called.
 501  * Errno value is in %g1.
      *
      * On entry %l6 holds the previous t_lofault value, possibly with
      * FPUSED_FLAG and/or BCOPY_FLAG set in its low bits.  %l1 is used to
      * carry BCOPY_FLAG and KPREEMPT_FLAG through to the exit code.
 502  */
 503 .copyerr:
              ! From here on a further fault goes to .copyerr2, which panics.
 504         set     .copyerr2, %l1
 505         membar  #Sync                   ! sync error barrier
 506         stn     %l1, [THREAD_REG + T_LOFAULT]   ! set t_lofault
 507         btst    FPUSED_FLAG, %l6
 508         bz      %icc, 1f                ! fp was never used: nothing to undo
 509           and   %l6, BCOPY_FLAG, %l1    ! copy flag to %l1
 510 
 511         membar  #Sync
 512 
 513         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
 514         wr      %o2, 0, %gsr
 515 
 516         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3     ! %o3 = %fprs saved on entry
 517         btst    FPRS_FEF, %o3
 518         bz      %icc, 4f                ! fp was not enabled on entry: no saved regs
 519           nop
 520 
 521         ! restore fpregs from stack
              ! (%fp + STACK_BIAS - 257) rounded down to a multiple of 64 is the
              ! 64-byte aligned start of the 256-byte save area; it is reloaded
              ! with four 64-byte block loads.
 522         membar  #Sync
 523         add     %fp, STACK_BIAS - 257, %o2
 524         and     %o2, -64, %o2
 525         ldda    [%o2]ASI_BLK_P, %d0
 526         add     %o2, 64, %o2
 527         ldda    [%o2]ASI_BLK_P, %d16
 528         add     %o2, 64, %o2
 529         ldda    [%o2]ASI_BLK_P, %d32
 530         add     %o2, 64, %o2
 531         ldda    [%o2]ASI_BLK_P, %d48
 532         membar  #Sync
 533 
 534         ba,pt   %ncc, 2f
 535           wr    %o3, 0, %fprs           ! restore fprs
 536 
 537 4:
 538         FZERO                           ! zero all of the fpregs
 539         wr      %o3, 0, %fprs           ! restore fprs
 540 
              ! A thread without an lwp (t_lwp == NULL) had its t_preempt count
              ! raised before fp was used; undo that here.
 541 2:      ldn     [THREAD_REG + T_LWP], %o2
 542         tst     %o2
 543         bnz,pt  %ncc, 1f                ! has an lwp: t_preempt was not touched
 544           nop
 545 
              ! kpreempt_enable(): the decremented count is stored from the
              ! (non-annulled) delay slot whether or not the branch is taken.
 546         ldsb    [THREAD_REG + T_PREEMPT], %l0
 547         deccc   %l0
 548         bnz,pn  %ncc, 1f                ! preemption still disabled by someone else
 549           stb   %l0, [THREAD_REG + T_PREEMPT]
 550 
 551         ! Check for a kernel preemption request
 552         ldn     [THREAD_REG + T_CPU], %l0
 553         ldub    [%l0 + CPU_KPRUNRUN], %l0
 554         tst     %l0
 555         bnz,a,pt        %ncc, 1f        ! Need to call kpreempt?
 556           or    %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
 557 
 558         !
 559         ! Need to cater for the different expectations of kcopy
 560         ! and bcopy. kcopy will *always* set a t_lofault handler
 561         ! If it fires, we're expected to just return the error code
 562         ! and *not* to invoke any existing error handler. As far as
 563         ! bcopy is concerned, we only set t_lofault if there was an
 564         ! existing lofault handler. In that case we're expected to
 565         ! invoke the previously existing handler after resetting the
 566         ! t_lofault value.
 567         !
 568 1:
 569         andn    %l6, COPY_FLAGS, %l6    ! remove flags from lofault address
 570         membar  #Sync                   ! sync error barrier
 571         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
 572 
 573         ! call kpreempt if necessary
              ! (done only now, after the original t_lofault is back in place)
 574         btst    KPREEMPT_FLAG, %l1
 575         bz,pt   %icc, 2f
 576           nop
              ! NOTE(review): the errno in %g1 is read after this call, so this
              ! relies on kpreempt() leaving %g1 intact - confirm.
 577         call    kpreempt
 578           rdpr  %pil, %o0       ! pass %pil
 579 2:
 580         btst    BCOPY_FLAG, %l1
 581         bnz,pn  %ncc, 3f                ! bcopy: hand off to the previous handler
 582           nop
              ! kcopy: return the errno from %g1 in the caller's %o0
 583         ret
 584         restore %g1, 0, %o0
 585 
 586 3:
 587         !
 588         ! We're here via bcopy. There *must* have been an error handler
 589         ! in place otherwise we would have died a nasty death already.
 590         !
 591         jmp     %l6                             ! goto real handler
 592         restore %g0, 0, %o0                     ! dispose of copy window
 593 
 594 /*
 595  * We got here because of a fault in .copyerr.  We can't safely restore fp
 596  * state, so we panic.
      *
      * The message string is placed in the text segment next to its only
      * user; the .align that follows puts the code back on a 4-byte
      * boundary after the string.
 597  */
 598 fp_panic_msg:
 599         .asciz  "Unable to restore fp state after copy operation"
 600 
 601         .align  4
 602 .copyerr2:
 603         set     fp_panic_msg, %o0
 604         call    panic                   ! no return path follows this call
 605           nop
 606         SET_SIZE(kcopy)
 607 
 608 
 609 /*
 610  * Copy a block of storage - must not overlap (from + len <= to).
 611  * Registers: l6 - saved t_lofault
 612  *
 613  * Copy a page of memory.
 614  * Assumes double word alignment and a count >= 256.
 615  */
 616 
 617         ENTRY(bcopy)
 618 
 619         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
 620         ldn     [THREAD_REG + T_LOFAULT], %l6   ! save t_lofault
 621         tst     %l6
 622         !
 623         ! We've already captured whether t_lofault was zero on entry.
 624         ! We need to mark ourselves as being from bcopy since both
 625         ! kcopy and bcopy use the same code path. If BCOPY_FLAG is set
 626         ! and the saved lofault was zero, we won't reset lofault on
 627         ! returning.
 628         !
 629         or      %l6, BCOPY_FLAG, %l6
 630         bz,pt   %ncc, .do_copy
 631         sethi   %hi(.copyerr), %o2
 632         or      %o2, %lo(.copyerr), %o2
 633         membar  #Sync                   ! sync error barrier
 634         stn     %o2, [THREAD_REG + T_LOFAULT]   ! install new vector
 635 
 636 .do_copy:
 637         cmp     %i2, 12                 ! for small counts
 638         blu     %ncc, .bytecp           ! just copy bytes
 639           .empty
 640 
 641         cmp     %i2, VIS_COPY_THRESHOLD ! for large counts
 642         blu,pt  %ncc, .bcb_punt
 643           .empty
 644 
 645         !
 646         ! Check to see if VIS acceleration is enabled
 647         !
 648         sethi   %hi(use_hw_bcopy), %o2
 649         ld      [%o2 + %lo(use_hw_bcopy)], %o2
 650         tst     %o2
 651         bz,pn   %icc, .bcb_punt
 652           nop
 653 
 654         subcc   %i1, %i0, %i3
 655         bneg,a,pn %ncc, 1f
 656         neg     %i3
 657 1:
 658         /*
 659          * Compare against 256 since we should be checking block addresses
 660          * and (dest & ~63) - (src & ~63) can be 3 blocks even if
 661          * src = dest + (64 * 3) + 63.
 662          */
 663         cmp     %i3, 256
 664         blu,pn  %ncc, .bcb_punt
 665           nop
 666 
 667         ldn     [THREAD_REG + T_LWP], %o3
 668         tst     %o3
 669         bnz,pt  %ncc, 1f
 670           nop
 671 
 672         ! kpreempt_disable();
 673         ldsb    [THREAD_REG + T_PREEMPT], %o2
 674         inc     %o2
 675         stb     %o2, [THREAD_REG + T_PREEMPT]
 676 
 677 1:
 678         rd      %fprs, %o2              ! check for unused fp
 679         st      %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
 680         btst    FPRS_FEF, %o2
 681         bz,a    %icc, .do_blockcopy
 682           wr    %g0, FPRS_FEF, %fprs
 683 
 684 .bcb_fpregs_inuse:
 685         cmp     %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
 686         bgeu    %ncc, 1f                !  if we have to save the fpregs)
 687           nop
 688 
 689         tst     %o3
 690         bnz,pt  %ncc, .bcb_punt
 691           nop
 692 
 693         ldsb    [THREAD_REG + T_PREEMPT], %l0
 694         deccc   %l0
 695         bnz,pn  %icc, .bcb_punt
 696           stb   %l0, [THREAD_REG + T_PREEMPT]
 697 
 698         ! Check for a kernel preemption request
 699         ldn     [THREAD_REG + T_CPU], %l0
 700         ldub    [%l0 + CPU_KPRUNRUN], %l0
 701         tst     %l0
 702         bz,pt   %icc, .bcb_punt
 703           nop
 704 
 705         ! Attempt to preempt
 706         call    kpreempt
 707           rdpr    %pil, %o0               ! pass %pil
 708 
 709         ba,pt   %ncc, .bcb_punt
 710           nop
 711 
 712 1:
 713         wr      %g0, FPRS_FEF, %fprs
 714 
 715         ! save in-use fpregs on stack
 716         membar  #Sync
 717         add     %fp, STACK_BIAS - 257, %o2
 718         and     %o2, -64, %o2
 719         stda    %d0, [%o2]ASI_BLK_P
 720         add     %o2, 64, %o2
 721         stda    %d16, [%o2]ASI_BLK_P
 722         add     %o2, 64, %o2
 723         stda    %d32, [%o2]ASI_BLK_P
 724         add     %o2, 64, %o2
 725         stda    %d48, [%o2]ASI_BLK_P
 726         membar  #Sync
 727 
 728 .do_blockcopy:
 729         membar  #StoreStore|#StoreLoad|#LoadStore
 730 
 731         rd      %gsr, %o2
 732         st      %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
 733 
 734         ! Set the lower bit in the saved t_lofault to indicate
 735         ! that we need to clear the %fprs register on the way
 736         ! out
 737         or      %l6, FPUSED_FLAG, %l6
 738 
 739         ! Swap src/dst since the code below is memcpy code
 740         ! and memcpy/bcopy have different calling sequences
 741         mov     %i1, %i5
 742         mov     %i0, %i1
 743         mov     %i5, %i0
 744 
 745 !!! This code is nearly identical to the version in the sun4u
 746 !!! libc_psr.  Most bugfixes made to that file should be
 747 !!! merged into this routine.
 748 
 749         andcc   %i0, 7, %o3
 750         bz,pt   %ncc, blkcpy
 751         sub     %o3, 8, %o3
 752         neg     %o3
 753         sub     %i2, %o3, %i2
 754 
 755         ! Align Destination on double-word boundary
 756 
 757 2:      ldub    [%i1], %o4
 758         inc     %i1
 759         inc     %i0
 760         deccc   %o3
 761         bgu     %ncc, 2b
 762         stb     %o4, [%i0 - 1]
 763 blkcpy: 
 764         andcc   %i0, 63, %i3
 765         bz,pn   %ncc, blalign           ! now block aligned
 766         sub     %i3, 64, %i3
 767         neg     %i3                     ! bytes till block aligned
 768         sub     %i2, %i3, %i2           ! update %i2 with new count
 769 
 770         ! Copy %i3 bytes till dst is block (64 byte) aligned. use
 771         ! double word copies.
 772 
 773         alignaddr %i1, %g0, %g1
 774         ldd     [%g1], %d0
 775         add     %g1, 8, %g1
 776 6:
 777         ldd     [%g1], %d2
 778         add     %g1, 8, %g1
 779         subcc   %i3, 8, %i3
 780         faligndata %d0, %d2, %d8
 781         std     %d8, [%i0]
 782         add     %i1, 8, %i1
 783         bz,pn   %ncc, blalign
 784         add     %i0, 8, %i0
 785         ldd     [%g1], %d0
 786         add     %g1, 8, %g1
 787         subcc   %i3, 8, %i3
 788         faligndata %d2, %d0, %d8
 789         std     %d8, [%i0]
 790         add     %i1, 8, %i1
 791         bgu,pn  %ncc, 6b
 792         add     %i0, 8, %i0
 793  
 794 blalign:
             ! Set-up for the VIS block copy.  Splits the remaining length
             ! into a block part, a doubleword part and a byte tail, preloads
             ! three 64-byte source blocks, then dispatches to seg0-seg7 on
             ! the source's doubleword offset within its 64-byte block.
             ! Here %i1 is the source and %i0 the destination.
 795         membar  #StoreLoad
 796         ! On entry %i2 = remaining length.  It is split three ways:
 797         ! %i3 = bytes for the block loop  ((length - 64) rounded down to 64)
 798         ! %i4 = bytes for the doubleword dribble  (((length - %i3) & ~7) - 16)
             ! %i2 = bytes left for the final byte loop  (length - %i3 - %i4)
 799         sub     %i2, 64, %i3
 800         andn    %i3, 63, %i3
 801         sub     %i2, %i3, %i4
 802         andn    %i4, 7, %i4
 803         sub     %i4, 16, %i4
 804         sub     %i2, %i4, %i2
 805         sub     %i2, %i3, %i2
 806 
 807         andn    %i1, 0x3f, %l7          ! blk aligned address
 808         alignaddr %i1, %g0, %g0         ! gen %gsr
 809 
 810         srl     %i1, 3, %l5             ! bits 3,4,5 are now least sig in  %l5
 811         andcc   %l5, 7, %i5             ! keep bits 0,1,2: %i5 = source doubleword index (0-7) in its block
 812         add     %i1, %i4, %i1           ! advance source past the doubleword part ...
 813         add     %i1, %i3, %i1           ! ... and the block part: %i1 = start of byte tail
 814 
 815         ldda    [%l7]ASI_BLK_P, %d0     ! preload 1st source block
 816         add     %l7, 64, %l7
 817         ldda    [%l7]ASI_BLK_P, %d16    ! preload 2nd source block
 818         add     %l7, 64, %l7
 819         ldda    [%l7]ASI_BLK_P, %d32    ! preload 3rd source block
 820         add     %l7, 64, %l7
 821         sub     %i3, 128, %i3           ! two blocks are stored by the segN tails, outside the counted loop
 822 
 823         ! switch statement to get us to the right 8 byte blk within a
 824         ! 64 byte block
             ! The ",a" branches annul their delay slot unless taken, so the
             ! compare in a delay slot only takes effect on the taken path.
 825         cmp      %i5, 4
 826         bgeu,a   hlf                    ! %i5 >= 4: choose among seg4-seg7
 827         cmp      %i5, 6                 ! executed only when branching to hlf
 828         cmp      %i5, 2
 829         bgeu,a   sqtr                   ! %i5 is 2 or 3
 830         nop
 831         cmp      %i5, 1
 832         be,a     seg1
 833         nop
 834         ba,pt    %ncc, seg0
 835         nop
 836 sqtr:
 837         be,a     seg2                   ! cc still from cmp %i5, 2
 838         nop
 839         ba,pt    %ncc, seg3
 840         nop
 841 
 842 hlf:
 843         bgeu,a   fqtr                   ! cc from cmp %i5, 6: %i5 is 6 or 7
 844         nop      
 845         cmp      %i5, 5
 846         be,a     seg5
 847         nop
 848         ba,pt    %ncc, seg4
 849         nop
 850 fqtr:
 851         be,a     seg6                   ! cc still from cmp %i5, 6
 852         nop
 853         ba,pt    %ncc, seg7
 854         nop
 855         
 856 
 857 seg0:
             ! Block loop used when the source starts at doubleword 0 of its
             ! 64-byte block (the dispatch branches here for %i5 == 0).
             ! Three register banks (%d0, %d16, %d32) are rotated: each chunk
             ! runs a FALIGN_* macro, reloads the low bank from [%l7] and
             ! block-stores %d48 to [%i0].  %i3 counts the bytes left for the
             ! loop.  The FALIGN_* macros are defined outside this section;
             ! presumably they leave the aligned block in %d48-%d62 - confirm.
 858         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
 859         FALIGN_D0
 860         ldda    [%l7]ASI_BLK_P, %d0
 861         stda    %d48, [%i0]ASI_BLK_P
 862         add     %l7, 64, %l7
 863         subcc   %i3, 64, %i3
 864         bz,pn   %ncc, 0f
 865         add     %i0, 64, %i0
 866         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
 867         FALIGN_D16
 868         ldda    [%l7]ASI_BLK_P, %d16
 869         stda    %d48, [%i0]ASI_BLK_P
 870         add     %l7, 64, %l7
 871         subcc   %i3, 64, %i3
 872         bz,pn   %ncc, 1f
 873         add     %i0, 64, %i0
 874         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
 875         FALIGN_D32
 876         ldda    [%l7]ASI_BLK_P, %d32
 877         stda    %d48, [%i0]ASI_BLK_P
 878         add     %l7, 64, %l7
 879         subcc   %i3, 64, %i3
 880         bz,pn   %ncc, 2f
 881         add     %i0, 64, %i0
 882         ba,a,pt %ncc, seg0              ! annulled: no delay slot instruction
 883 
 884 0:
             ! Loop ended after chunk 1: store the two blocks still held in
             ! registers, then finish in the doubleword dribble at blkd0.
 885         FALIGN_D16
 886         stda    %d48, [%i0]ASI_BLK_P
 887         add     %i0, 64, %i0
 888         membar  #Sync                   ! NOTE(review): presumably lets the block store finish before %d48 is rewritten - confirm
 889         FALIGN_D32
 890         stda    %d48, [%i0]ASI_BLK_P
 891         ba,pt   %ncc, blkd0
 892         add     %i0, 64, %i0
 893 
 894 1:
             ! Loop ended after chunk 2: store two more blocks, dribble from blkd16.
 895         FALIGN_D32
 896         stda    %d48, [%i0]ASI_BLK_P
 897         add     %i0, 64, %i0
 898         membar  #Sync
 899         FALIGN_D0
 900         stda    %d48, [%i0]ASI_BLK_P
 901         ba,pt   %ncc, blkd16
 902         add     %i0, 64, %i0
 903 
 904 2:
             ! Loop ended after chunk 3: store two more blocks, dribble from blkd32.
 905         FALIGN_D0
 906         stda    %d48, [%i0]ASI_BLK_P
 907         add     %i0, 64, %i0
 908         membar  #Sync
 909         FALIGN_D16
 910         stda    %d48, [%i0]ASI_BLK_P
 911         ba,pt   %ncc, blkd32
 912         add     %i0, 64, %i0
 913 
 914 seg1:
             ! Block loop used when the source starts at doubleword 1 of its
             ! 64-byte block (the dispatch branches here for %i5 == 1).
             ! Each chunk runs a FALIGN_* macro (defined outside this section;
             ! presumably it fills %d48-%d62 - confirm), reloads one register
             ! bank from [%l7] and block-stores %d48 to [%i0].  %i3 counts the
             ! bytes left for the loop.
 915         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
 916         FALIGN_D2
 917         ldda    [%l7]ASI_BLK_P, %d0
 918         stda    %d48, [%i0]ASI_BLK_P
 919         add     %l7, 64, %l7
 920         subcc   %i3, 64, %i3
 921         bz,pn   %ncc, 0f
 922         add     %i0, 64, %i0
 923         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
 924         FALIGN_D18
 925         ldda    [%l7]ASI_BLK_P, %d16
 926         stda    %d48, [%i0]ASI_BLK_P
 927         add     %l7, 64, %l7
 928         subcc   %i3, 64, %i3
 929         bz,pn   %ncc, 1f
 930         add     %i0, 64, %i0
 931         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
 932         FALIGN_D34
 933         ldda    [%l7]ASI_BLK_P, %d32
 934         stda    %d48, [%i0]ASI_BLK_P
 935         add     %l7, 64, %l7
 936         subcc   %i3, 64, %i3
 937         bz,pn   %ncc, 2f
 938         add     %i0, 64, %i0
 939         ba,a,pt %ncc, seg1              ! annulled: no delay slot instruction
 940 0:
             ! Loop ended after chunk 1: store the two blocks still held in
             ! registers, then finish in the doubleword dribble at blkd2.
 941         FALIGN_D18
 942         stda    %d48, [%i0]ASI_BLK_P
 943         add     %i0, 64, %i0
 944         membar  #Sync                   ! NOTE(review): presumably lets the block store finish before %d48 is rewritten - confirm
 945         FALIGN_D34
 946         stda    %d48, [%i0]ASI_BLK_P
 947         ba,pt   %ncc, blkd2
 948         add     %i0, 64, %i0
 949 
 950 1:
             ! Loop ended after chunk 2: store two more blocks, dribble from blkd18.
 951         FALIGN_D34
 952         stda    %d48, [%i0]ASI_BLK_P
 953         add     %i0, 64, %i0
 954         membar  #Sync
 955         FALIGN_D2
 956         stda    %d48, [%i0]ASI_BLK_P
 957         ba,pt   %ncc, blkd18
 958         add     %i0, 64, %i0
 959 
 960 2:
             ! Loop ended after chunk 3: store two more blocks, dribble from blkd34.
 961         FALIGN_D2
 962         stda    %d48, [%i0]ASI_BLK_P
 963         add     %i0, 64, %i0
 964         membar  #Sync
 965         FALIGN_D18
 966         stda    %d48, [%i0]ASI_BLK_P
 967         ba,pt   %ncc, blkd34
 968         add     %i0, 64, %i0
 969 
 970 seg2:
             ! Block loop used when the source starts at doubleword 2 of its
             ! 64-byte block (the dispatch branches here for %i5 == 2).
             ! Each chunk runs a FALIGN_* macro (defined outside this section;
             ! presumably it fills %d48-%d62 - confirm), reloads one register
             ! bank from [%l7] and block-stores %d48 to [%i0].  %i3 counts the
             ! bytes left for the loop.
 971         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
 972         FALIGN_D4
 973         ldda    [%l7]ASI_BLK_P, %d0
 974         stda    %d48, [%i0]ASI_BLK_P
 975         add     %l7, 64, %l7
 976         subcc   %i3, 64, %i3
 977         bz,pn   %ncc, 0f
 978         add     %i0, 64, %i0
 979         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
 980         FALIGN_D20
 981         ldda    [%l7]ASI_BLK_P, %d16
 982         stda    %d48, [%i0]ASI_BLK_P
 983         add     %l7, 64, %l7
 984         subcc   %i3, 64, %i3
 985         bz,pn   %ncc, 1f
 986         add     %i0, 64, %i0
 987         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
 988         FALIGN_D36
 989         ldda    [%l7]ASI_BLK_P, %d32
 990         stda    %d48, [%i0]ASI_BLK_P
 991         add     %l7, 64, %l7
 992         subcc   %i3, 64, %i3
 993         bz,pn   %ncc, 2f
 994         add     %i0, 64, %i0
 995         ba,a,pt %ncc, seg2              ! annulled: no delay slot instruction
 996 
 997 0:
             ! Loop ended after chunk 1: store the two blocks still held in
             ! registers, then finish in the doubleword dribble at blkd4.
 998         FALIGN_D20
 999         stda    %d48, [%i0]ASI_BLK_P
1000         add     %i0, 64, %i0
1001         membar  #Sync                   ! NOTE(review): presumably lets the block store finish before %d48 is rewritten - confirm
1002         FALIGN_D36
1003         stda    %d48, [%i0]ASI_BLK_P
1004         ba,pt   %ncc, blkd4
1005         add     %i0, 64, %i0
1006 
1007 1:
             ! Loop ended after chunk 2: store two more blocks, dribble from blkd20.
1008         FALIGN_D36
1009         stda    %d48, [%i0]ASI_BLK_P
1010         add     %i0, 64, %i0
1011         membar  #Sync
1012         FALIGN_D4
1013         stda    %d48, [%i0]ASI_BLK_P
1014         ba,pt   %ncc, blkd20
1015         add     %i0, 64, %i0
1016 
1017 2:
             ! Loop ended after chunk 3: store two more blocks, dribble from blkd36.
1018         FALIGN_D4
1019         stda    %d48, [%i0]ASI_BLK_P
1020         add     %i0, 64, %i0
1021         membar  #Sync
1022         FALIGN_D20
1023         stda    %d48, [%i0]ASI_BLK_P
1024         ba,pt   %ncc, blkd36
1025         add     %i0, 64, %i0
1026 
1027 seg3:
             ! Block loop used when the source starts at doubleword 3 of its
             ! 64-byte block (the dispatch branches here for %i5 == 3).
             ! Each chunk runs a FALIGN_* macro (defined outside this section;
             ! presumably it fills %d48-%d62 - confirm), reloads one register
             ! bank from [%l7] and block-stores %d48 to [%i0].  %i3 counts the
             ! bytes left for the loop.
1028         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1029         FALIGN_D6
1030         ldda    [%l7]ASI_BLK_P, %d0
1031         stda    %d48, [%i0]ASI_BLK_P
1032         add     %l7, 64, %l7
1033         subcc   %i3, 64, %i3
1034         bz,pn   %ncc, 0f
1035         add     %i0, 64, %i0
1036         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1037         FALIGN_D22
1038         ldda    [%l7]ASI_BLK_P, %d16
1039         stda    %d48, [%i0]ASI_BLK_P
1040         add     %l7, 64, %l7
1041         subcc   %i3, 64, %i3
1042         bz,pn   %ncc, 1f
1043         add     %i0, 64, %i0
1044         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1045         FALIGN_D38
1046         ldda    [%l7]ASI_BLK_P, %d32
1047         stda    %d48, [%i0]ASI_BLK_P
1048         add     %l7, 64, %l7
1049         subcc   %i3, 64, %i3
1050         bz,pn   %ncc, 2f
1051         add     %i0, 64, %i0
1052         ba,a,pt %ncc, seg3              ! annulled: no delay slot instruction
1053 
1054 0:
             ! Loop ended after chunk 1: store the two blocks still held in
             ! registers, then finish in the doubleword dribble at blkd6.
1055         FALIGN_D22
1056         stda    %d48, [%i0]ASI_BLK_P
1057         add     %i0, 64, %i0
1058         membar  #Sync                   ! NOTE(review): presumably lets the block store finish before %d48 is rewritten - confirm
1059         FALIGN_D38
1060         stda    %d48, [%i0]ASI_BLK_P
1061         ba,pt   %ncc, blkd6
1062         add     %i0, 64, %i0
1063 
1064 1:
             ! Loop ended after chunk 2: store two more blocks, dribble from blkd22.
1065         FALIGN_D38
1066         stda    %d48, [%i0]ASI_BLK_P
1067         add     %i0, 64, %i0
1068         membar  #Sync
1069         FALIGN_D6
1070         stda    %d48, [%i0]ASI_BLK_P
1071         ba,pt   %ncc, blkd22
1072         add     %i0, 64, %i0
1073 
1074 2:
             ! Loop ended after chunk 3: store two more blocks, dribble from blkd38.
1075         FALIGN_D6
1076         stda    %d48, [%i0]ASI_BLK_P
1077         add     %i0, 64, %i0
1078         membar  #Sync
1079         FALIGN_D22
1080         stda    %d48, [%i0]ASI_BLK_P
1081         ba,pt   %ncc, blkd38
1082         add     %i0, 64, %i0
1083 
1084 seg4:
             ! Block loop used when the source starts at doubleword 4 of its
             ! 64-byte block (the dispatch branches here for %i5 == 4).
             ! Each chunk runs a FALIGN_* macro (defined outside this section;
             ! presumably it fills %d48-%d62 - confirm), reloads one register
             ! bank from [%l7] and block-stores %d48 to [%i0].  %i3 counts the
             ! bytes left for the loop.
1085         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1086         FALIGN_D8
1087         ldda    [%l7]ASI_BLK_P, %d0
1088         stda    %d48, [%i0]ASI_BLK_P
1089         add     %l7, 64, %l7
1090         subcc   %i3, 64, %i3
1091         bz,pn   %ncc, 0f
1092         add     %i0, 64, %i0
1093         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1094         FALIGN_D24
1095         ldda    [%l7]ASI_BLK_P, %d16
1096         stda    %d48, [%i0]ASI_BLK_P
1097         add     %l7, 64, %l7
1098         subcc   %i3, 64, %i3
1099         bz,pn   %ncc, 1f
1100         add     %i0, 64, %i0
1101         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1102         FALIGN_D40
1103         ldda    [%l7]ASI_BLK_P, %d32
1104         stda    %d48, [%i0]ASI_BLK_P
1105         add     %l7, 64, %l7
1106         subcc   %i3, 64, %i3
1107         bz,pn   %ncc, 2f
1108         add     %i0, 64, %i0
1109         ba,a,pt %ncc, seg4              ! annulled: no delay slot instruction
1110 
1111 0:
             ! Loop ended after chunk 1: store the two blocks still held in
             ! registers, then finish in the doubleword dribble at blkd8.
1112         FALIGN_D24
1113         stda    %d48, [%i0]ASI_BLK_P
1114         add     %i0, 64, %i0
1115         membar  #Sync                   ! NOTE(review): presumably lets the block store finish before %d48 is rewritten - confirm
1116         FALIGN_D40
1117         stda    %d48, [%i0]ASI_BLK_P
1118         ba,pt   %ncc, blkd8
1119         add     %i0, 64, %i0
1120 
1121 1:
             ! Loop ended after chunk 2: store two more blocks, dribble from blkd24.
1122         FALIGN_D40
1123         stda    %d48, [%i0]ASI_BLK_P
1124         add     %i0, 64, %i0
1125         membar  #Sync
1126         FALIGN_D8
1127         stda    %d48, [%i0]ASI_BLK_P
1128         ba,pt   %ncc, blkd24
1129         add     %i0, 64, %i0
1130 
1131 2:
             ! Loop ended after chunk 3: store two more blocks, dribble from blkd40.
1132         FALIGN_D8
1133         stda    %d48, [%i0]ASI_BLK_P
1134         add     %i0, 64, %i0
1135         membar  #Sync
1136         FALIGN_D24
1137         stda    %d48, [%i0]ASI_BLK_P
1138         ba,pt   %ncc, blkd40
1139         add     %i0, 64, %i0
1140 
1141 seg5:
             ! Block loop used when the source starts at doubleword 5 of its
             ! 64-byte block (the dispatch branches here for %i5 == 5).
             ! Each chunk runs a FALIGN_* macro (defined outside this section;
             ! presumably it fills %d48-%d62 - confirm), reloads one register
             ! bank from [%l7] and block-stores %d48 to [%i0].  %i3 counts the
             ! bytes left for the loop.
1142         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1143         FALIGN_D10
1144         ldda    [%l7]ASI_BLK_P, %d0
1145         stda    %d48, [%i0]ASI_BLK_P
1146         add     %l7, 64, %l7
1147         subcc   %i3, 64, %i3
1148         bz,pn   %ncc, 0f
1149         add     %i0, 64, %i0
1150         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1151         FALIGN_D26
1152         ldda    [%l7]ASI_BLK_P, %d16
1153         stda    %d48, [%i0]ASI_BLK_P
1154         add     %l7, 64, %l7
1155         subcc   %i3, 64, %i3
1156         bz,pn   %ncc, 1f
1157         add     %i0, 64, %i0
1158         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1159         FALIGN_D42
1160         ldda    [%l7]ASI_BLK_P, %d32
1161         stda    %d48, [%i0]ASI_BLK_P
1162         add     %l7, 64, %l7
1163         subcc   %i3, 64, %i3
1164         bz,pn   %ncc, 2f
1165         add     %i0, 64, %i0
1166         ba,a,pt %ncc, seg5              ! annulled: no delay slot instruction
1167 
1168 0:
             ! Loop ended after chunk 1: store the two blocks still held in
             ! registers, then finish in the doubleword dribble at blkd10.
1169         FALIGN_D26
1170         stda    %d48, [%i0]ASI_BLK_P
1171         add     %i0, 64, %i0
1172         membar  #Sync                   ! NOTE(review): presumably lets the block store finish before %d48 is rewritten - confirm
1173         FALIGN_D42
1174         stda    %d48, [%i0]ASI_BLK_P
1175         ba,pt   %ncc, blkd10
1176         add     %i0, 64, %i0
1177 
1178 1:
             ! Loop ended after chunk 2: store two more blocks, dribble from blkd26.
1179         FALIGN_D42
1180         stda    %d48, [%i0]ASI_BLK_P
1181         add     %i0, 64, %i0
1182         membar  #Sync
1183         FALIGN_D10
1184         stda    %d48, [%i0]ASI_BLK_P
1185         ba,pt   %ncc, blkd26
1186         add     %i0, 64, %i0
1187 
1188 2:
             ! Loop ended after chunk 3: store two more blocks, dribble from blkd42.
1189         FALIGN_D10
1190         stda    %d48, [%i0]ASI_BLK_P
1191         add     %i0, 64, %i0
1192         membar  #Sync
1193         FALIGN_D26
1194         stda    %d48, [%i0]ASI_BLK_P
1195         ba,pt   %ncc, blkd42
1196         add     %i0, 64, %i0
1197 
1198 seg6:
             ! Block loop used when the source starts at doubleword 6 of its
             ! 64-byte block (the dispatch branches here for %i5 == 6).
             ! Each chunk runs a FALIGN_* macro (defined outside this section;
             ! presumably it fills %d48-%d62 - confirm), reloads one register
             ! bank from [%l7] and block-stores %d48 to [%i0].  %i3 counts the
             ! bytes left for the loop.
1199         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1200         FALIGN_D12
1201         ldda    [%l7]ASI_BLK_P, %d0
1202         stda    %d48, [%i0]ASI_BLK_P
1203         add     %l7, 64, %l7
1204         subcc   %i3, 64, %i3
1205         bz,pn   %ncc, 0f
1206         add     %i0, 64, %i0
1207         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1208         FALIGN_D28
1209         ldda    [%l7]ASI_BLK_P, %d16
1210         stda    %d48, [%i0]ASI_BLK_P
1211         add     %l7, 64, %l7
1212         subcc   %i3, 64, %i3
1213         bz,pn   %ncc, 1f
1214         add     %i0, 64, %i0
1215         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1216         FALIGN_D44
1217         ldda    [%l7]ASI_BLK_P, %d32
1218         stda    %d48, [%i0]ASI_BLK_P
1219         add     %l7, 64, %l7
1220         subcc   %i3, 64, %i3
1221         bz,pn   %ncc, 2f
1222         add     %i0, 64, %i0
1223         ba,a,pt %ncc, seg6              ! annulled: no delay slot instruction
1224 
1225 0:
             ! Loop ended after chunk 1: store the two blocks still held in
             ! registers, then finish in the doubleword dribble at blkd12.
1226         FALIGN_D28
1227         stda    %d48, [%i0]ASI_BLK_P
1228         add     %i0, 64, %i0
1229         membar  #Sync                   ! NOTE(review): presumably lets the block store finish before %d48 is rewritten - confirm
1230         FALIGN_D44
1231         stda    %d48, [%i0]ASI_BLK_P
1232         ba,pt   %ncc, blkd12
1233         add     %i0, 64, %i0
1234 
1235 1:
             ! Loop ended after chunk 2: store two more blocks, dribble from blkd28.
1236         FALIGN_D44
1237         stda    %d48, [%i0]ASI_BLK_P
1238         add     %i0, 64, %i0
1239         membar  #Sync
1240         FALIGN_D12
1241         stda    %d48, [%i0]ASI_BLK_P
1242         ba,pt   %ncc, blkd28
1243         add     %i0, 64, %i0
1244 
1245 2:
             ! Loop ended after chunk 3: store two more blocks, dribble from blkd44.
1246         FALIGN_D12
1247         stda    %d48, [%i0]ASI_BLK_P
1248         add     %i0, 64, %i0
1249         membar  #Sync
1250         FALIGN_D28
1251         stda    %d48, [%i0]ASI_BLK_P
1252         ba,pt   %ncc, blkd44
1253         add     %i0, 64, %i0
1254 
1255 seg7:
             ! Block loop used when the source starts at doubleword 7 of its
             ! 64-byte block (the dispatch branches here for %i5 == 7).
             ! Each chunk runs a FALIGN_* macro (defined outside this section;
             ! presumably it fills %d48-%d62 - confirm), reloads one register
             ! bank from [%l7] and block-stores %d48 to [%i0].  %i3 counts the
             ! bytes left for the loop.
1256         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1257         FALIGN_D14
1258         ldda    [%l7]ASI_BLK_P, %d0
1259         stda    %d48, [%i0]ASI_BLK_P
1260         add     %l7, 64, %l7
1261         subcc   %i3, 64, %i3
1262         bz,pn   %ncc, 0f
1263         add     %i0, 64, %i0
1264         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1265         FALIGN_D30
1266         ldda    [%l7]ASI_BLK_P, %d16
1267         stda    %d48, [%i0]ASI_BLK_P
1268         add     %l7, 64, %l7
1269         subcc   %i3, 64, %i3
1270         bz,pn   %ncc, 1f
1271         add     %i0, 64, %i0
1272         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1273         FALIGN_D46
1274         ldda    [%l7]ASI_BLK_P, %d32
1275         stda    %d48, [%i0]ASI_BLK_P
1276         add     %l7, 64, %l7
1277         subcc   %i3, 64, %i3
1278         bz,pn   %ncc, 2f
1279         add     %i0, 64, %i0
1280         ba,a,pt %ncc, seg7              ! annulled: no delay slot instruction
1281 
1282 0:
             ! Loop ended after chunk 1: store the two blocks still held in
             ! registers, then finish in the doubleword dribble at blkd14.
1283         FALIGN_D30
1284         stda    %d48, [%i0]ASI_BLK_P
1285         add     %i0, 64, %i0
1286         membar  #Sync                   ! NOTE(review): presumably lets the block store finish before %d48 is rewritten - confirm
1287         FALIGN_D46
1288         stda    %d48, [%i0]ASI_BLK_P
1289         ba,pt   %ncc, blkd14
1290         add     %i0, 64, %i0
1291 
1292 1:
             ! Loop ended after chunk 2: store two more blocks, dribble from blkd30.
1293         FALIGN_D46
1294         stda    %d48, [%i0]ASI_BLK_P
1295         add     %i0, 64, %i0
1296         membar  #Sync
1297         FALIGN_D14
1298         stda    %d48, [%i0]ASI_BLK_P
1299         ba,pt   %ncc, blkd30
1300         add     %i0, 64, %i0
1301 
1302 2:
             ! Loop ended after chunk 3: store two more blocks, dribble from blkd46.
1303         FALIGN_D14
1304         stda    %d48, [%i0]ASI_BLK_P
1305         add     %i0, 64, %i0
1306         membar  #Sync
1307         FALIGN_D30
1308         stda    %d48, [%i0]ASI_BLK_P
1309         ba,pt   %ncc, blkd46
1310         add     %i0, 64, %i0
1311 
1312 
1313         !
1314         ! dribble out the last partial block
1315         !
1316 blkd0:
             ! Doubleword dribble over the %d0-%d14 bank.  The segN tails
             ! enter at the blkdN label naming the first register not yet
             ! consumed.  Each step takes 8 off %i4 and leaves for blkdone if
             ! that underflows; otherwise it stores one aligned doubleword to
             ! [%i0].  The faligndata sits in the (non-annulled) delay slot,
             ! so it also runs on the exit path; it only writes scratch %d48.
1317         subcc   %i4, 8, %i4
1318         blu,pn  %ncc, blkdone
1319         faligndata %d0, %d2, %d48
1320         std     %d48, [%i0]
1321         add     %i0, 8, %i0
1322 blkd2:
1323         subcc   %i4, 8, %i4
1324         blu,pn  %ncc, blkdone
1325         faligndata %d2, %d4, %d48
1326         std     %d48, [%i0]
1327         add     %i0, 8, %i0
1328 blkd4:
1329         subcc   %i4, 8, %i4
1330         blu,pn  %ncc, blkdone
1331         faligndata %d4, %d6, %d48
1332         std     %d48, [%i0]
1333         add     %i0, 8, %i0
1334 blkd6:
1335         subcc   %i4, 8, %i4
1336         blu,pn  %ncc, blkdone
1337         faligndata %d6, %d8, %d48
1338         std     %d48, [%i0]
1339         add     %i0, 8, %i0
1340 blkd8:
1341         subcc   %i4, 8, %i4
1342         blu,pn  %ncc, blkdone
1343         faligndata %d8, %d10, %d48
1344         std     %d48, [%i0]
1345         add     %i0, 8, %i0
1346 blkd10:
1347         subcc   %i4, 8, %i4
1348         blu,pn  %ncc, blkdone
1349         faligndata %d10, %d12, %d48
1350         std     %d48, [%i0]
1351         add     %i0, 8, %i0
1352 blkd12:
1353         subcc   %i4, 8, %i4
1354         blu,pn  %ncc, blkdone
1355         faligndata %d12, %d14, %d48
1356         std     %d48, [%i0]
1357         add     %i0, 8, %i0
1358 blkd14:
             ! Last register of the bank: nothing is stored here.  %d14 is
             ! moved to %d0, which blkleft uses as the low half of its first
             ! faligndata; the doubleword counted here is stored by blkleft.
1359         subcc   %i4, 8, %i4
1360         blu,pn  %ncc, blkdone
1361         fsrc1   %d14, %d0
1362         ba,a,pt %ncc, blkleft           ! annulled: no delay slot instruction
1363 
1364 blkd16:
             ! Doubleword dribble over the %d16-%d30 bank.  The segN tails
             ! enter at the blkdN label naming the first register not yet
             ! consumed.  Each step takes 8 off %i4 and leaves for blkdone if
             ! that underflows; otherwise it stores one aligned doubleword to
             ! [%i0].  The faligndata sits in the (non-annulled) delay slot,
             ! so it also runs on the exit path; it only writes scratch %d48.
1365         subcc   %i4, 8, %i4
1366         blu,pn  %ncc, blkdone
1367         faligndata %d16, %d18, %d48
1368         std     %d48, [%i0]
1369         add     %i0, 8, %i0
1370 blkd18:
1371         subcc   %i4, 8, %i4
1372         blu,pn  %ncc, blkdone
1373         faligndata %d18, %d20, %d48
1374         std     %d48, [%i0]
1375         add     %i0, 8, %i0
1376 blkd20:
1377         subcc   %i4, 8, %i4
1378         blu,pn  %ncc, blkdone
1379         faligndata %d20, %d22, %d48
1380         std     %d48, [%i0]
1381         add     %i0, 8, %i0
1382 blkd22:
1383         subcc   %i4, 8, %i4
1384         blu,pn  %ncc, blkdone
1385         faligndata %d22, %d24, %d48
1386         std     %d48, [%i0]
1387         add     %i0, 8, %i0
1388 blkd24:
1389         subcc   %i4, 8, %i4
1390         blu,pn  %ncc, blkdone
1391         faligndata %d24, %d26, %d48
1392         std     %d48, [%i0]
1393         add     %i0, 8, %i0
1394 blkd26:
1395         subcc   %i4, 8, %i4
1396         blu,pn  %ncc, blkdone
1397         faligndata %d26, %d28, %d48
1398         std     %d48, [%i0]
1399         add     %i0, 8, %i0
1400 blkd28:
1401         subcc   %i4, 8, %i4
1402         blu,pn  %ncc, blkdone
1403         faligndata %d28, %d30, %d48
1404         std     %d48, [%i0]
1405         add     %i0, 8, %i0
1406 blkd30:
             ! Last register of the bank: nothing is stored here.  %d30 is
             ! moved to %d0, which blkleft uses as the low half of its first
             ! faligndata; the doubleword counted here is stored by blkleft.
1407         subcc   %i4, 8, %i4
1408         blu,pn  %ncc, blkdone
1409         fsrc1   %d30, %d0
1410         ba,a,pt %ncc, blkleft           ! annulled: no delay slot instruction
1411 blkd32:
             ! Doubleword dribble over the %d32-%d46 bank.  The segN tails
             ! enter at the blkdN label naming the first register not yet
             ! consumed.  Each step takes 8 off %i4 and leaves for blkdone if
             ! that underflows; otherwise it stores one aligned doubleword to
             ! [%i0].  The faligndata sits in the (non-annulled) delay slot,
             ! so it also runs on the exit path; it only writes scratch %d48.
1412         subcc   %i4, 8, %i4
1413         blu,pn  %ncc, blkdone
1414         faligndata %d32, %d34, %d48
1415         std     %d48, [%i0]
1416         add     %i0, 8, %i0
1417 blkd34:
1418         subcc   %i4, 8, %i4
1419         blu,pn  %ncc, blkdone
1420         faligndata %d34, %d36, %d48
1421         std     %d48, [%i0]
1422         add     %i0, 8, %i0
1423 blkd36:
1424         subcc   %i4, 8, %i4
1425         blu,pn  %ncc, blkdone
1426         faligndata %d36, %d38, %d48
1427         std     %d48, [%i0]
1428         add     %i0, 8, %i0
1429 blkd38:
1430         subcc   %i4, 8, %i4
1431         blu,pn  %ncc, blkdone
1432         faligndata %d38, %d40, %d48
1433         std     %d48, [%i0]
1434         add     %i0, 8, %i0
1435 blkd40:
1436         subcc   %i4, 8, %i4
1437         blu,pn  %ncc, blkdone
1438         faligndata %d40, %d42, %d48
1439         std     %d48, [%i0]
1440         add     %i0, 8, %i0
1441 blkd42:
1442         subcc   %i4, 8, %i4
1443         blu,pn  %ncc, blkdone
1444         faligndata %d42, %d44, %d48
1445         std     %d48, [%i0]
1446         add     %i0, 8, %i0
1447 blkd44:
1448         subcc   %i4, 8, %i4
1449         blu,pn  %ncc, blkdone
1450         faligndata %d44, %d46, %d48
1451         std     %d48, [%i0]
1452         add     %i0, 8, %i0
1453 blkd46:
             ! Last register of the bank: nothing is stored here.  %d46 is
             ! moved to %d0 (in the delay slot) and control falls through
             ! into blkleft, which stores the doubleword counted here.
1454         subcc   %i4, 8, %i4
1455         blu,pn  %ncc, blkdone
1456         fsrc1   %d46, %d0
1457 
1458 blkleft:
             ! The preloaded register banks are used up: keep dribbling by
             ! loading doublewords straight from [%l7].  On entry %d0 holds
             ! the previous source doubleword and %i4 has already been
             ! decremented for the doubleword about to be stored.  The loop is
             ! unrolled twice so %d0 and %d2 swap roles as low/high input to
             ! faligndata.  Each half stores first, then tests the %i4 count
             ! that subcc took for the next doubleword.
1459 1:
1460         ldd     [%l7], %d2
1461         add     %l7, 8, %l7
1462         subcc   %i4, 8, %i4
1463         faligndata %d0, %d2, %d8
1464         std     %d8, [%i0]
1465         blu,pn  %ncc, blkdone           ! cc from the subcc above (std does not change them)
1466         add     %i0, 8, %i0             ! delay slot: executed on both paths
1467         ldd     [%l7], %d0
1468         add     %l7, 8, %l7
1469         subcc   %i4, 8, %i4
1470         faligndata %d2, %d0, %d8
1471         std     %d8, [%i0]
1472         bgeu,pt %ncc, 1b
1473         add     %i0, 8, %i0             ! delay slot: executed on both paths
1474 
1475 blkdone:
             ! Block and doubleword copying is finished.  Copy the trailing
             ! %i2 bytes one at a time from [%i1] to [%i0], then fall into
             ! .bcb_exit.  If %i2 is zero, branch straight to .bcb_exit.
1476         tst     %i2
1477         bz,pt   %ncc, .bcb_exit
1478         and     %l3, 0x4, %l3           ! fprs.du = fprs.dl = 0
                                             ! (delay slot: executed whether or not the branch is taken)
1479 
1480 7:      ldub    [%i1], %i4
1481         inc     %i1
1482         inc     %i0
1483         deccc   %i2
1484         bgu,pt  %ncc, 7b
1485           stb     %i4, [%i0 - 1]        ! delay slot: %i0 was already advanced, hence the -1
1486 
1487 .bcb_exit:
             ! Common exit for the copy.  If FPUSED_FLAG is set in %l6,
             ! restore %gsr, the FP registers (or zero them) and %fprs.  Then
             ! handle a pending kernel preemption, restore t_lofault where
             ! required, and return 0.
1488         membar  #StoreLoad|#StoreStore
1489         btst    FPUSED_FLAG, %l6
1490         bz      %icc, 1f                ! FP was not used: skip the FP restore
1491           and   %l6, COPY_FLAGS, %l1    ! Store flags in %l1
1492                                         ! We can't clear the flags from %l6 yet.
1493                                         ! If there's an error, .copyerr will
1494                                         ! need them
1495 
1496         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
1497         wr      %o2, 0, %gsr
1498 
1499         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1500         btst    FPRS_FEF, %o3
1501         bz      %icc, 4f                ! saved %fprs had FEF clear: no saved fpregs to reload
1502           nop
1503 
1504         ! restore fpregs from stack
             ! %o2 = 64-byte aligned save area below %fp.
             ! NOTE(review): the 257 must match the address computed where the
             ! registers were saved (not visible in this section) - confirm.
1505         membar  #Sync
1506         add     %fp, STACK_BIAS - 257, %o2
1507         and     %o2, -64, %o2
1508         ldda    [%o2]ASI_BLK_P, %d0
1509         add     %o2, 64, %o2
1510         ldda    [%o2]ASI_BLK_P, %d16
1511         add     %o2, 64, %o2
1512         ldda    [%o2]ASI_BLK_P, %d32
1513         add     %o2, 64, %o2
1514         ldda    [%o2]ASI_BLK_P, %d48
1515         membar  #Sync
1516 
1517         ba,pt   %ncc, 2f        
1518           wr    %o3, 0, %fprs           ! restore fprs
1519 
1520 4:
1521         FZERO                           ! zero all of the fpregs
1522         wr      %o3, 0, %fprs           ! restore fprs
1523 
             ! If curthread->t_lwp is NULL, decrement t_preempt and, when it
             ! reaches zero, check whether a kernel preemption is pending.
1524 2:      ldn     [THREAD_REG + T_LWP], %o2
1525         tst     %o2
1526         bnz,pt  %ncc, 1f
1527           nop
1528 
1529         ldsb    [THREAD_REG + T_PREEMPT], %l0
1530         deccc   %l0
1531         bnz,pn  %ncc, 1f
1532           stb   %l0, [THREAD_REG + T_PREEMPT]   ! delay slot: decremented count is stored on both paths
1533 
1534         ! Check for a kernel preemption request
1535         ldn     [THREAD_REG + T_CPU], %l0
1536         ldub    [%l0 + CPU_KPRUNRUN], %l0
1537         tst     %l0
1538         bnz,a,pt        %ncc, 1f        ! Need to call kpreempt?
1539           or    %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag
1540 
1541 1:
1542         btst    BCOPY_FLAG, %l1
1543         bz,pn   %icc, 3f                ! BCOPY_FLAG clear: go restore t_lofault
1544           andncc        %l6, COPY_FLAGS, %l6    ! strip the flags; cc now reflect the saved handler
1545 
1546         !
1547         ! Here via bcopy. Check to see if the handler was NULL.
1548         ! If so, just return quietly. Otherwise, reset the
1549         ! handler and go home.
1550         ! 
1551         bnz,pn  %ncc, 3f                ! tests the andncc result from the delay slot above
1552           nop
1553 
1554         !
1555         ! Null handler.  Check for kpreempt flag, call if necessary,
1556         ! then return.
1557         !
1558         btst    KPREEMPT_FLAG, %l1
1559         bz,pt   %icc, 2f
1560           nop
1561         call    kpreempt
1562           rdpr  %pil, %o0       ! pass %pil
1563 2:
1564         ret
1565           restore       %g0, 0, %o0     ! return 0
1566 
1567         !
1568         ! Here via kcopy or bcopy with a handler. Reset the
1569         ! fault handler.
1570         !
1571 3:
1572         membar  #Sync
1573         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1574 
1575         ! call kpreempt if necessary
1576         btst    KPREEMPT_FLAG, %l1
1577         bz,pt   %icc, 4f
1578           nop
1579         call    kpreempt
1580           rdpr  %pil, %o0       ! pass %pil
1581 4:
1582         ret
1583           restore       %g0, 0, %o0     ! return 0
1584 
1585 .bcb_punt:
             ! Non-VIS copy path.  Picks a strategy from the relative
             ! alignment of source (%i0) and destination (%i1):
             !   low 3 bits of (src ^ dst) zero -> .aldoubcp
             !   low 2 bits of (src ^ dst) zero -> .alwordcp
             !   otherwise                      -> the shift-and-merge word
             !                                     copy that follows
1586         !
1587         ! use aligned transfers where possible
1588         !
1589         xor     %i0, %i1, %o4           ! xor from and to address
1590         btst    7, %o4                  ! if lower three bits zero
1591         bz      %icc, .aldoubcp         ! can align on double boundary
1592         .empty  ! assembler complaints about label
1593 
             ! The xor below sits in the delay slot of the branch above; it
             ! recomputes the same value, so it is harmless on either path.
1594         xor     %i0, %i1, %o4           ! xor from and to address
1595         btst    3, %o4                  ! if lower two bits zero
1596         bz      %icc, .alwordcp         ! can align on word boundary
1597         btst    3, %i0                  ! delay slot, from address unaligned?
1598         !
1599         ! use aligned reads and writes where possible
1600         ! this differs from wordcp in that it copes
1601         ! with odd alignment between source and destination
1602         ! using word reads and writes with the proper shifts
1603         ! in between to align transfers to and from memory
1604         ! i0 - src address, i1 - dest address, i2 - count
1605         ! i3, i4 - tmps used for generating complete word
1606         ! i5 (word to write)
1607         ! l0 size in bits of upper part of source word (US)
1608         ! l1 size in bits of lower part of source word (LS = 32 - US)
1609         ! l2 size in bits of upper part of destination word (UD)
1610         ! l3 size in bits of lower part of destination word (LD = 32 - UD)
1611         ! l4 number of bytes leftover after aligned transfers complete
1612         ! l5 the number 32
1613         !
1614         mov     32, %l5                 ! load an oft-needed constant
1615         bz      .align_dst_only         ! cc still from btst 3, %i0: source is word aligned
1616         btst    3, %i1                  ! is destination address aligned?
1617         clr     %i4                     ! clear registers used in either case
1618         bz      %icc, .align_src_only
1619         clr     %l0                     ! delay slot: executed on both paths
1620         !
1621         ! both source and destination addresses are unaligned
1622         !
1623 1:                                      ! align source
1624         ldub    [%i0], %i3              ! read a byte from source address
1625         add     %i0, 1, %i0             ! increment source address
1626         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
1627         btst    3, %i0                  ! is source aligned?
1628         add     %l0, 8, %l0             ! increment size of upper source (US)
1629         bnz,a   1b
1630         sll     %i4, 8, %i4             ! make room for next byte
1631 
1632         sub     %l5, %l0, %l1           ! generate shift left count (LS)
1633         sll     %i4, %l1, %i4           ! prepare to get rest
1634         ld      [%i0], %i3              ! read a word
1635         add     %i0, 4, %i0             ! increment source address
1636         srl     %i3, %l0, %i5           ! upper src bits into lower dst bits
1637         or      %i4, %i5, %i5           ! merge
1638         mov     24, %l3                 ! align destination
1639 1:
1640         srl     %i5, %l3, %i4           ! prepare to write a single byte
1641         stb     %i4, [%i1]              ! write a byte
1642         add     %i1, 1, %i1             ! increment destination address
1643         sub     %i2, 1, %i2             ! decrement count
1644         btst    3, %i1                  ! is destination aligned?
1645         bnz,a   1b
1646         sub     %l3, 8, %l3             ! delay slot, decrement shift count (LD)
1647         sub     %l5, %l3, %l2           ! generate shift left count (UD)
1648         sll     %i5, %l2, %i5           ! move leftover into upper bytes
1649         cmp     %l2, %l0                ! cmp # reqd to fill dst w old src left
1650         bgu     %ncc, .more_needed      ! need more to fill than we have
1651         nop
1652 
1653         sll     %i3, %l1, %i3           ! clear upper used byte(s)
1654         srl     %i3, %l1, %i3
1655         ! get the odd bytes between alignments
1656         sub     %l0, %l2, %l0           ! regenerate shift count
1657         sub     %l5, %l0, %l1           ! generate new shift left count (LS)
1658         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
1659         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
1660         srl     %i3, %l0, %i4
1661         or      %i5, %i4, %i5
1662         st      %i5, [%i1]              ! write a word
1663         subcc   %i2, 4, %i2             ! decrement count
1664         bz      %ncc, .unalign_out
1665         add     %i1, 4, %i1             ! increment destination address
1666 
1667         b       2f
1668         sll     %i3, %l1, %i5           ! get leftover into upper bits
1669 .more_needed:
1670         sll     %i3, %l0, %i3           ! save remaining byte(s)
1671         srl     %i3, %l0, %i3
1672         sub     %l2, %l0, %l1           ! regenerate shift count
1673         sub     %l5, %l1, %l0           ! generate new shift left count
1674         sll     %i3, %l1, %i4           ! move to fill empty space
1675         b       3f
1676         or      %i5, %i4, %i5           ! merge to complete word
1677         !
1678         ! the source address is aligned and destination is not
1679         !
1680 .align_dst_only:
1681         ld      [%i0], %i4              ! read a word
1682         add     %i0, 4, %i0             ! increment source address
1683         mov     24, %l0                 ! initial shift alignment count
1684 1:
1685         srl     %i4, %l0, %i3           ! prepare to write a single byte
1686         stb     %i3, [%i1]              ! write a byte
1687         add     %i1, 1, %i1             ! increment destination address
1688         sub     %i2, 1, %i2             ! decrement count
1689         btst    3, %i1                  ! is destination aligned?
1690         bnz,a   1b
1691         sub     %l0, 8, %l0             ! delay slot, decrement shift count
1692 .xfer:
1693         sub     %l5, %l0, %l1           ! generate shift left count
1694         sll     %i4, %l1, %i5           ! get leftover
1695 3:
1696         and     %i2, 3, %l4             ! must do remaining bytes if count%4 > 0
1697         andn    %i2, 3, %i2             ! # of aligned bytes that can be moved
             ! Main loop: read an aligned source word, merge its upper bits
             ! with the leftover in %i5, write an aligned destination word.
1698 2:
1699         ld      [%i0], %i3              ! read a source word
1700         add     %i0, 4, %i0             ! increment source address
1701         srl     %i3, %l0, %i4           ! upper src bits into lower dst bits
1702         or      %i5, %i4, %i5           ! merge with upper dest bits (leftover)
1703         st      %i5, [%i1]              ! write a destination word
1704         subcc   %i2, 4, %i2             ! decrement count
1705         bz      %ncc, .unalign_out      ! check if done
1706         add     %i1, 4, %i1             ! increment destination address
1707         b       2b                      ! loop
1708         sll     %i3, %l1, %i5           ! get leftover
1709 .unalign_out:
1710         tst     %l4                     ! any bytes leftover?
1711         bz      %ncc, .cpdone
1712         .empty                          ! allow next instruction in delay slot
1713 1:
1714         sub     %l0, 8, %l0             ! decrement shift
1715         srl     %i3, %l0, %i4           ! upper src byte into lower dst byte
1716         stb     %i4, [%i1]              ! write a byte
1717         subcc   %l4, 1, %l4             ! decrement count
1718         bz      %ncc, .cpdone           ! done?
1719         add     %i1, 1, %i1             ! increment destination
1720         tst     %l0                     ! any more previously read bytes
1721         bnz     %ncc, 1b                ! we have leftover bytes
1722         mov     %l4, %i2                ! delay slot, mv cnt where dbytecp wants
1723         b       .dbytecp                ! let dbytecp do the rest
1724         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
1725         !
1726         ! the destination address is aligned and the source is not
1727         !
1728 .align_src_only:
1729         ldub    [%i0], %i3              ! read a byte from source address
1730         add     %i0, 1, %i0             ! increment source address
1731         or      %i4, %i3, %i4           ! or in with previous bytes (if any)
1732         btst    3, %i0                  ! is source aligned?
1733         add     %l0, 8, %l0             ! increment shift count (US)
1734         bnz,a   .align_src_only
1735         sll     %i4, 8, %i4             ! make room for next byte
1736         b,a     .xfer                   ! annulled: no delay slot instruction
1737         !
1738         ! if from address unaligned for double-word moves,
1739         ! move bytes till it is, if count is < 56 it could take
1740         ! longer to align the thing than to do the transfer
1741         ! in word size chunks right away
1742         !
1743 .aldoubcp:
1744         cmp     %i2, 56                 ! if count < 56, use wordcp, it takes
1745         blu,a   %ncc, .alwordcp         ! longer to align doubles than words
1746         mov     3, %o0                  ! mask for word alignment
1747         call    .alignit                ! copy bytes until aligned
1748         mov     7, %o0                  ! mask for double alignment
1749         !
1750         ! source and destination are now double-word aligned
1751         ! i3 has aligned count returned by alignit
1752         !
1753         and     %i2, 7, %i2             ! unaligned leftover count
1754         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
1755 5:
1756         ldx     [%i0+%i1], %o4          ! read from address
1757         stx     %o4, [%i1]              ! write at destination address
1758         subcc   %i3, 8, %i3             ! dec count
1759         bgu     %ncc, 5b
1760         add     %i1, 8, %i1             ! delay slot, inc to address
1761         cmp     %i2, 4                  ! see if we can copy a word
1762         blu     %ncc, .dbytecp          ! if 3 or less bytes use bytecp
1763         .empty
1764         !
1765         ! for leftover bytes we fall into wordcp, if needed
1766         !
1767 .wordcp:
1768         and     %i2, 3, %i2             ! unaligned leftover count
1769 5:
1770         ld      [%i0+%i1], %o4          ! read from address
1771         st      %o4, [%i1]              ! write at destination address
1772         subcc   %i3, 4, %i3             ! dec count
1773         bgu     %ncc, 5b
1774         add     %i1, 4, %i1             ! delay slot, inc to address
1775         b,a     .dbytecp
1776 
1777         ! we come here to align copies on word boundaries
1778 .alwordcp:
1779         call    .alignit                ! go word-align it
1780         mov     3, %o0                  ! bits that must be zero to be aligned
1781         b       .wordcp
1782         sub     %i0, %i1, %i0           ! i0 gets the difference of src and dst
1783 
1784         !
1785         ! byte copy, works with any alignment
1786         !
1787 .bytecp:
1788         b       .dbytecp
1789         sub     %i0, %i1, %i0           ! i0 gets difference of src and dst
1790 
1791         !
1792         ! differenced byte copy, works with any alignment
1793         ! assumes dest in %i1 and (source - dest) in %i0
1794         !
1795 1:
1796         stb     %o4, [%i1]              ! write to address
1797         inc     %i1                     ! inc to address
1798 .dbytecp:
1799         deccc   %i2                     ! dec count
1800         bgeu,a  %ncc, 1b                ! loop till done
1801         ldub    [%i0+%i1], %o4          ! read from address
1802         !
1803         ! FPUSED_FLAG will not have been set in any path leading to
1804         ! this point. No need to deal with it.
1805         !
1806 .cpdone:
1807         btst    BCOPY_FLAG, %l6
1808         bz,pn   %icc, 2f
1809         andncc  %l6, BCOPY_FLAG, %l6
1810         !
1811         ! Here via bcopy. Check to see if the handler was NULL.
1812         ! If so, just return quietly. Otherwise, reset the
1813         ! handler and go home.
1814         !
1815         bnz,pn  %ncc, 2f
1816         nop
1817         !
1818         ! Null handler.
1819         !
1820         ret
1821         restore %g0, 0, %o0
1822         !
1823         ! Here via kcopy or bcopy with a handler. Reset the
1824         ! fault handler.
1825         !
1826 2:
1827         membar  #Sync
1828         stn     %l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1829         ret
1830         restore %g0, 0, %o0             ! return (0)
1831 
1832 /*
1833  * Common code used to align transfers on word and doubleword
1834  * boundaries.  Aligns source and destination and returns a count
1835  * of aligned bytes to transfer in %i3
1836  */
1837 1:
1838         inc     %i0                     ! inc from
1839         stb     %o4, [%i1]              ! write a byte
1840         inc     %i1                     ! inc to
1841         dec     %i2                     ! dec count
1842 .alignit:
1843         btst    %o0, %i0                ! %o0 is bit mask to check for alignment
1844         bnz,a   1b
1845         ldub    [%i0], %o4              ! read next byte
1846 
1847         retl
1848         andn    %i2, %o0, %i3           ! return size of aligned bytes
1849         SET_SIZE(bcopy)
1850 
1851 /*
1852  * Block copy with possibly overlapped operands (%o0 = from, %o1 = to, %o2 = count).
1853  */
1854 
1855         ENTRY(ovbcopy)
1856         tst     %o2                     ! check count
1857         bgu,a   %ncc, 1f                ! count != 0: go on; else nothing to do
1858         subcc   %o0, %o1, %o3           ! annulled delay slot: %o3 = from - to
1859 
1860         retl                            ! return
1861         nop                             ! delay slot
1862 1:
1863         bneg,a  %ncc, 2f                ! cc still from the subcc above
1864         neg     %o3                     ! annulled delay slot: if < 0, make it positive
1865 2:      cmp     %o2, %o3                ! cmp size and abs(from - to)
1866         bleu    %ncc, bcopy             ! if size <= abs(diff): use bcopy,
1867         .empty                          !   no overlap (next insn is the delay slot)
1868         cmp     %o0, %o1                ! compare from and to addresses
1869         blu     %ncc, .ov_bkwd          ! if from < to, copy backwards
1870         nop                             ! delay slot
1871         !
1872         ! Copy forwards.
1873         !
1874 .ov_fwd:
1875         ldub    [%o0], %o3              ! read from address
1876         inc     %o0                     ! inc from address
1877         stb     %o3, [%o1]              ! write to address
1878         deccc   %o2                     ! dec count
1879         bgu     %ncc, .ov_fwd           ! loop till done
1880         inc     %o1                     ! delay slot, inc to address
1881 
1882         retl                            ! return
1883         nop                             ! delay slot
1884         !
1885         ! Copy backwards.
1886         !
1887 .ov_bkwd:
1888         deccc   %o2                     ! dec count; %o2 is now the offset to move
1889         ldub    [%o0 + %o2], %o3        ! get byte at end of src
1890         bgu     %ncc, .ov_bkwd          ! loop till done
1891         stb     %o3, [%o1 + %o2]        ! delay slot, store at end of dst
1892 
1893         retl                            ! return
1894         nop                             ! delay slot
1895         SET_SIZE(ovbcopy)
1896 
1897 /*
1898  * hwblkpagecopy()
1899  *
1900  * Copies exactly one page.  This routine assumes the caller (ppcopy)
1901  * has already disabled kernel preemption and has checked
1902  * use_hw_bcopy.
1903  */
1904         ENTRY(hwblkpagecopy)
1905         ! get another window w/space for three 64-byte aligned blocks of saved fpregs
1906         save    %sp, -SA(MINFRAME + 4*64), %sp  ! 3 blocks + 64 bytes of alignment slack
1907 
1908         ! %i0 - source address (arg)
1909         ! %i1 - destination address (arg)
1910         ! %i2 - length of region (not arg)
1911         ! %l0 - saved fprs
1912         ! %l1 - pointer to saved fpregs
1913 
1914         rd      %fprs, %l0              ! check for unused fp
1915         btst    FPRS_FEF, %l0
1916         bz      1f                      ! FEF clear: fp unused, nothing to save
1917         membar  #Sync                   ! delay slot, executed on both paths
1918 
1919         ! save in-use fpregs on stack
1920         add     %fp, STACK_BIAS - 193, %l1      ! start 193 bytes below the frame top ...
1921         and     %l1, -64, %l1                   ! ... and round down to a 64-byte boundary
1922         stda    %d0, [%l1]ASI_BLK_P     ! block store starting at %d0
1923         add     %l1, 64, %l3
1924         stda    %d16, [%l3]ASI_BLK_P    ! block store starting at %d16
1925         add     %l3, 64, %l3
1926         stda    %d32, [%l3]ASI_BLK_P    ! block store starting at %d32
1927         membar  #Sync
1928 
1929 1:      wr      %g0, FPRS_FEF, %fprs    ! %fprs = FPRS_FEF
1930         ldda    [%i0]ASI_BLK_P, %d0     ! preload the first source block
1931         add     %i0, 64, %i0
1932         set     PAGESIZE - 64, %i2      ! bytes of the page not yet loaded
1933 
1934 2:      ldda    [%i0]ASI_BLK_P, %d16    ! load the next source block into %d16 and up
1935         fsrc1   %d0, %d32               ! copy %d0-%d14 to %d32-%d46
1936         fsrc1   %d2, %d34
1937         fsrc1   %d4, %d36
1938         fsrc1   %d6, %d38
1939         fsrc1   %d8, %d40
1940         fsrc1   %d10, %d42
1941         fsrc1   %d12, %d44
1942         fsrc1   %d14, %d46
1943         stda    %d32, [%i1]ASI_BLK_P    ! store that block at the destination
1944         add     %i0, 64, %i0
1945         subcc   %i2, 64, %i2
1946         bz,pn   %ncc, 3f                ! nothing left to load: last block is in %d16
1947         add     %i1, 64, %i1            ! delay slot, advance destination
1948         ldda    [%i0]ASI_BLK_P, %d0     ! same again with the two load buffers swapped
1949         fsrc1   %d16, %d32              ! copy %d16-%d30 to %d32-%d46
1950         fsrc1   %d18, %d34
1951         fsrc1   %d20, %d36
1952         fsrc1   %d22, %d38
1953         fsrc1   %d24, %d40
1954         fsrc1   %d26, %d42
1955         fsrc1   %d28, %d44
1956         fsrc1   %d30, %d46
1957         stda    %d32, [%i1]ASI_BLK_P
1958         add     %i0, 64, %i0
1959         sub     %i2, 64, %i2            ! no end test in this half of the loop
1960         ba,pt   %ncc, 2b
1961         add     %i1, 64, %i1            ! delay slot, advance destination
1962 
1963 3:      membar  #Sync
1964         btst    FPRS_FEF, %l0           ! were the fpregs saved on entry?
1965         bz      4f
1966         stda    %d16, [%i1]ASI_BLK_P    ! delay slot (always executed): store final block
1967 
1968         ! restore fpregs from stack
1969         membar  #Sync
1970         ldda    [%l1]ASI_BLK_P, %d0
1971         add     %l1, 64, %l3
1972         ldda    [%l3]ASI_BLK_P, %d16
1973         add     %l3, 64, %l3
1974         ldda    [%l3]ASI_BLK_P, %d32
1975 
1976 4:      wr      %l0, 0, %fprs           ! restore fprs
1977         membar #Sync
1978         ret
1979         restore %g0, 0, %o0             ! delay slot, %o0 = 0 for the caller
1980         SET_SIZE(hwblkpagecopy)
1981 
1982 
1983 /*
1984  * Transfer data to and from user space -
1985  * Note that these routines can cause faults
1986  * It is assumed that the kernel has nothing at
1987  * less than KERNELBASE in the virtual address space.
1988  *
1989  * Note that copyin(9F) and copyout(9F) are part of the
1990  * DDI/DKI which specifies that they return '-1' on "errors."
1991  *
1992  * Sigh.
1993  *
1994  * So there's two extremely similar routines - xcopyin() and xcopyout()
1995  * which return the errno that we've faithfully computed.  This
1996  * allows other callers (e.g. uiomove(9F)) to work correctly.
1997  * Given that these are used pretty heavily, we expand the calling
1998  * sequences inline for all flavours (rather than making wrappers).
1999  *
2000  * There are also stub routines for xcopyout_little and xcopyin_little,
2001  * which currently are intended to handle requests of <= 16 bytes from
2002  * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2003  * is left as an exercise...
2004  */
2005 
2006 /*
2007  * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2008  *
2009  * General theory of operation:
2010  *
2011  * The only difference between default_copy{in,out} and
2012  * default_xcopy{in,out} is in the error handling routine they invoke
2013  * when a memory access error is seen. default_xcopyOP returns the errno
2014  * while default_copyOP returns -1 (see above). copy{in,out}_noerr set
2015  * a special flag (by oring the value 2 into the fault handler address)
2016  * if they are called with a fault handler already in place. That flag
2017  * causes the default handlers to trampoline to the previous handler
2018  * upon an error.
2019  *
2020  * None of the copyops routines grab a window until it's decided that
2021  * we need to do a HW block copy operation. This saves a window
2022  * spill/fill when we're called during socket ops. The typical IO
2023  * path won't cause spill/fill traps.
2024  *
2025  * This code uses a set of 4 limits for the maximum size that will
2026  * be copied given a particular input/output address alignment.
2027  * the default limits are:
2028  *
2029  * single byte aligned - 900 (hw_copy_limit_1)
2030  * two byte aligned - 1800 (hw_copy_limit_2)
2031  * four byte aligned - 3600 (hw_copy_limit_4)
2032  * eight byte aligned - 7200 (hw_copy_limit_8)
2033  *
2034  * If the value for a particular limit is zero, the copy will be done
2035  * via the copy loops rather than VIS.
2036  *
2037  * Flow:
2038  *
2039  * If count == zero return zero.
2040  *
2041  * Store the previous lo_fault handler into %g6.
2042  * Place our secondary lofault handler into %g5.
2043  * Place the address of our nowindow fault handler into %o3.
2044  * Place the address of the windowed fault handler into %o4.
2045  * --> We'll use this handler if we end up grabbing a window
2046  * --> before we use VIS instructions.
2047  *
2048  * If count is less than or equal to SMALL_LIMIT (7) we
2049  * always do a byte for byte copy.
2050  *
2051  * If count is > SMALL_LIMIT, we check the alignment of the input
2052  * and output pointers. Based on the alignment we check count
2053  * against a soft limit of VIS_COPY_THRESHOLD (900 on spitfire). If
2054  * we're larger than VIS_COPY_THRESHOLD, we check against a limit based
2055  * on detected alignment. If we exceed the alignment value we copy
2056  * via VIS instructions.
2057  *
2058  * If we don't exceed one of the limits, we store -count in %o3,
2059  * we store the number of chunks (8, 4, 2 or 1 byte) operated
2060  * on in our basic copy loop in %o2. Following this we branch 
2061  * to the appropriate copy loop and copy that many chunks.
2062  * Since we've been adding the chunk size to %o3 each time through
2063  * as well as decrementing %o2, we can tell if any data is
2064  * left to be copied by examining %o3. If that is zero, we're
2065  * done and can go home. If not, we figure out what the largest
2066  * chunk size left to be copied is and branch to that copy loop
2067  * unless there's only one byte left. We load that as we're
2068  * branching to code that stores it just before we return.
2069  *
2070  * There is one potential situation in which we start to do a VIS
2071  * copy but decide to punt and return to the copy loops. There is
2072  * (in the default configuration) a window of 256 bytes between
2073  * the single byte aligned copy limit and what VIS treats as its
2074  * minimum if floating point is in use in the calling app. We need
2075  * to be prepared to handle this. See the .small_copyOP label for
2076  * details.
2077  *
2078  * Fault handlers are invoked if we reference memory that has no
2079  * current mapping.  All forms share the same copyio_fault handler.
2080  * This routine handles fixing up the stack and general housecleaning.
2081  * Each copy operation has a simple fault handler that is then called
2082  * to do the work specific to the individual operation.  The handlers
2083  * for default_copyOP and copyOP_noerr are found at the end of
2084  * default_copyout. The handlers for default_xcopyOP are found at the
2085  * end of xdefault_copyin.
2086  */
2087 
2088 /*
2089  * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2090  */
2091 
2092 /*
2093  * We save the arguments in the following registers in case of a fault:
2094  *      kaddr - %g2
2095  *      uaddr - %g3
2096  *      count - %g4
2097  */
2098 #define SAVE_SRC        %g2     /* copy of the first argument (%o0) */
2099 #define SAVE_DST        %g3     /* copy of the second argument (%o1) */
2100 #define SAVE_COUNT      %g4     /* copy of the third argument (%o2) */
2101 
2102 #define REAL_LOFAULT            %g5     /* operation-specific handler the copyio_fault code jumps to */
2103 #define SAVED_LOFAULT           %g6     /* t_lofault found on entry; may have FPUSED_FLAG or'ed in */
2104 
2105 /*
2106  * Generic copyio fault handler.  This is the first line of defense when a 
2107  * fault occurs in (x)copyin/(x)copyout.  In order for this to function
2108  * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2109  * This allows us to share common code for all the flavors of the copy
2110  * operations, including the _noerr versions.
2111  *
2112  * Note that this function will restore the original input parameters before
2113  * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
2114  * member of the t_copyop structure, if needed.
2115  */
2116         ENTRY(copyio_fault)
2117         btst    FPUSED_FLAG, SAVED_LOFAULT      ! did the block copy path mark fp as used?
2118         bz      1f                              ! no: no fp state to put back
2119           andn  SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT       ! delay slot, strip the flag
2120 
2121         membar  #Sync
2122 
2123         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! gsr saved in the frame
2124         wr      %o2, 0, %gsr            ! restore gsr
2125 
2126         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3     ! fprs saved in the frame
2127         btst    FPRS_FEF, %o3
2128         bz      4f                      ! FEF was clear: no fpregs were saved, zero them
2129           nop
2130 
2131         ! restore fpregs from stack
2132         membar  #Sync
2133         add     %fp, STACK_BIAS - 257, %o2      ! same address computation as the save
2134         and     %o2, -64, %o2                   ! in .copyout_fpregs_inuse
2135         ldda    [%o2]ASI_BLK_P, %d0
2136         add     %o2, 64, %o2
2137         ldda    [%o2]ASI_BLK_P, %d16
2138         add     %o2, 64, %o2
2139         ldda    [%o2]ASI_BLK_P, %d32
2140         add     %o2, 64, %o2
2141         ldda    [%o2]ASI_BLK_P, %d48
2142         membar  #Sync
2143 
2144         ba,pt   %ncc, 1f
2145           wr    %o3, 0, %fprs           ! restore fprs
2146 
2147 4:
2148         FZERO                           ! zero all of the fpregs
2149         wr      %o3, 0, %fprs           ! restore fprs
2150 
2151 1:
2152         ! NOTE(review): t_lofault is not rewritten on this path - confirm REAL_LOFAULT does it
2153         restore                         ! drop the window taken before the block copy
2154 
2155         mov     SAVE_SRC, %o0           ! hand the saved arguments back in %o0-%o2
2156         mov     SAVE_DST, %o1
2157         jmp     REAL_LOFAULT            ! continue in the operation-specific handler
2158           mov   SAVE_COUNT, %o2         ! delay slot
2159         SET_SIZE(copyio_fault)
2160 
2161         ENTRY(copyio_fault_nowindow)    /* fault handler installed while still in leaf mode */
2162         membar  #Sync
2163         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2164 
2165         mov     SAVE_SRC, %o0           ! hand the saved arguments back in %o0-%o2
2166         mov     SAVE_DST, %o1
2167         jmp     REAL_LOFAULT            ! continue in the operation-specific handler
2168           mov   SAVE_COUNT, %o2         ! delay slot
2169         SET_SIZE(copyio_fault_nowindow)
2170 
2171         ENTRY(copyout)
2172         sethi   %hi(.copyout_err), REAL_LOFAULT
2173         or      REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2174 
2175 .do_copyout:
2176         !
2177         ! Check the length and bail if zero.
2178         !
2179         tst     %o2
2180         bnz,pt  %ncc, 1f
2181           nop
2182         retl
2183           clr   %o0
2184 1:
2185         sethi   %hi(copyio_fault), %o4
2186         or      %o4, %lo(copyio_fault), %o4
2187         sethi   %hi(copyio_fault_nowindow), %o3
2188         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2189         or      %o3, %lo(copyio_fault_nowindow), %o3
2190         membar  #Sync
2191         stn     %o3, [THREAD_REG + T_LOFAULT]
2192 
2193         mov     %o0, SAVE_SRC
2194         mov     %o1, SAVE_DST
2195         mov     %o2, SAVE_COUNT
2196 
2197         !
2198         ! Check to see if we're more than SMALL_LIMIT (7 bytes).
2199         ! Run in leaf mode, using the %o regs as our input regs.
2200         !
2201         subcc   %o2, SMALL_LIMIT, %o3
2202         bgu,a,pt %ncc, .dco_ns
2203         or      %o0, %o1, %o3
2204         !
2205         ! What was previously ".small_copyout"
2206         ! Do full differenced copy.
2207         !
2208 .dcobcp:
2209         sub     %g0, %o2, %o3           ! negate count
2210         add     %o0, %o2, %o0           ! make %o0 point at the end
2211         add     %o1, %o2, %o1           ! make %o1 point at the end
2212         ba,pt   %ncc, .dcocl
2213         ldub    [%o0 + %o3], %o4        ! load first byte
2214         !
2215         ! %o0 and %o1 point at the end and remain pointing at the end
2216         ! of their buffers. We pull things out by adding %o3 (which is
2217         ! the negation of the length) to the buffer end which gives us
2218         ! the current location in the buffers. By incrementing %o3 we walk
2219         ! through both buffers without having to bump each buffer's
2220         ! pointer. A very fast 4 instruction loop.
2221         !
2222         .align 16
2223 .dcocl:
2224         stba    %o4, [%o1 + %o3]ASI_USER
2225         inccc   %o3
2226         bl,a,pt %ncc, .dcocl
2227         ldub    [%o0 + %o3], %o4
2228         !
2229         ! We're done. Go home.
2230         !
2231         membar  #Sync
2232         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
2233         retl
2234         clr     %o0
2235         !
2236         ! Try aligned copies from here.
2237         !
2238 .dco_ns:
2239         ! %o0 = kernel addr (to be copied from)
2240         ! %o1 = user addr (to be copied to)
2241         ! %o2 = length
2242         ! %o3 = %o0 | %o1 (used for alignment checking)
2243         ! %o4 is alternate lo_fault
2244         ! SAVED_LOFAULT (%g6) is original lo_fault
2245         !
2246         ! See if we're single byte aligned. If we are, check the
2247         ! limit for single byte copies. If we're smaller or equal,
2248         ! bounce to the byte for byte copy loop. Otherwise do it in
2249         ! HW (if enabled).
2250         !
2251         btst    1, %o3
2252         bz,pt   %icc, .dcoh8
2253         btst    7, %o3
2254         !
2255         ! Single byte aligned. Do we do it via HW or via
2256         ! byte for byte? Do a quick no memory reference
2257         ! check to pick up small copies.
2258         !
2259         subcc   %o2, VIS_COPY_THRESHOLD, %o3
2260         bleu,pt %ncc, .dcobcp
2261         sethi   %hi(hw_copy_limit_1), %o3
2262         !
2263         ! Big enough that we need to check the HW limit for
2264         ! this size copy.
2265         !
2266         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
2267         !
2268         ! Is HW copy on? If not, do everything byte for byte.
2269         !
2270         tst     %o3
2271         bz,pn   %icc, .dcobcp
2272         subcc   %o3, %o2, %o3
2273         !
2274         ! If we're less than or equal to the single byte copy limit,
2275         ! bop to the copy loop.
2276         !
2277         bge,pt  %ncc, .dcobcp
2278         nop
2279         !
2280         ! We're big enough and copy is on. Do it with HW.
2281         !
2282         ba,pt   %ncc, .big_copyout
2283         nop
2284 .dcoh8:
2285         !
2286         ! 8 byte aligned?
2287         !
2288         bnz,a   %ncc, .dcoh4
2289         btst    3, %o3
2290         !
2291         ! See if we're in the "small range".
2292         ! If so, go off and do the copy.
2293         ! If not, load the hard limit. %o3 is
2294         ! available for reuse.
2295         !
2296         subcc   %o2, VIS_COPY_THRESHOLD, %o3
2297         bleu,pt %ncc, .dcos8
2298         sethi   %hi(hw_copy_limit_8), %o3
2299         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
2300         !
2301         ! If it's zero, there's no HW bcopy.
2302         ! Bop off to the aligned copy.
2303         !
2304         tst     %o3
2305         bz,pn   %icc, .dcos8
2306         subcc   %o3, %o2, %o3
2307         !
2308         ! We're negative if our size is larger than hw_copy_limit_8.
2309         !
2310         bge,pt  %ncc, .dcos8
2311         nop
2312         !
2313         ! HW assist is on and we're large enough. Do it.
2314         !
2315         ba,pt   %ncc, .big_copyout
2316         nop
2317 .dcos8:
2318         !
2319         ! Housekeeping for copy loops. Uses same idea as in the byte for
2320         ! byte copy loop above.
2321         !
2322         add     %o0, %o2, %o0
2323         add     %o1, %o2, %o1
2324         sub     %g0, %o2, %o3
2325         ba,pt   %ncc, .dodebc
2326         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
2327         !
2328         ! 4 byte aligned?
2329         !
2330 .dcoh4:
2331         bnz,pn  %ncc, .dcoh2
2332         !
2333         ! See if we're in the "small range".
2334         ! If so, go off and do the copy.
2335         ! If not, load the hard limit. %o3 is
2336         ! available for reuse.
2337         !
2338         subcc   %o2, VIS_COPY_THRESHOLD, %o3
2339         bleu,pt %ncc, .dcos4
2340         sethi   %hi(hw_copy_limit_4), %o3
2341         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
2342         !
2343         ! If it's zero, there's no HW bcopy.
2344         ! Bop off to the aligned copy.
2345         !
2346         tst     %o3
2347         bz,pn   %icc, .dcos4
2348         subcc   %o3, %o2, %o3
2349         !
2350         ! We're negative if our size is larger than hw_copy_limit_4.
2351         !
2352         bge,pt  %ncc, .dcos4
2353         nop
2354         !
2355         ! HW assist is on and we're large enough. Do it.
2356         !
2357         ba,pt   %ncc, .big_copyout
2358         nop
2359 .dcos4:
2360         add     %o0, %o2, %o0
2361         add     %o1, %o2, %o1
2362         sub     %g0, %o2, %o3
2363         ba,pt   %ncc, .dodfbc
2364         srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
2365         !
2366         ! We must be 2 byte aligned. Off we go.
2367         ! The check for small copies was done in the
2368         ! delay at .dcoh4
2369         !
2370 .dcoh2:
2371         ble     %ncc, .dcos2
2372         sethi   %hi(hw_copy_limit_2), %o3
2373         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
2374         tst     %o3
2375         bz,pn   %icc, .dcos2
2376         subcc   %o3, %o2, %o3
2377         bge,pt  %ncc, .dcos2
2378         nop
2379         !
2380         ! HW is on and we're big enough. Do it.
2381         !
2382         ba,pt   %ncc, .big_copyout
2383         nop
2384 .dcos2:
2385         add     %o0, %o2, %o0
2386         add     %o1, %o2, %o1
2387         sub     %g0, %o2, %o3
2388         ba,pt   %ncc, .dodtbc
2389         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
2390 .small_copyout:
2391         !
2392         ! Why are we doing this AGAIN? There are certain conditions in
2393         ! big_copyout that will cause us to forego the HW assisted copies
2394         ! and bounce back to a non-HW assisted copy. This dispatches those
2395         ! copies. Note that we branch around this in the main line code.
2396         !
2397         ! We make no check for limits or HW enablement here. We've
2398         ! already been told that we're a poster child so just go off
2399         ! and do it.
2400         !
2401         or      %o0, %o1, %o3
2402         btst    1, %o3
2403         bnz     %icc, .dcobcp           ! Most likely
2404         btst    7, %o3
2405         bz      %icc, .dcos8
2406         btst    3, %o3
2407         bz      %icc, .dcos4
2408         nop
2409         ba,pt   %ncc, .dcos2
2410         nop
2411         .align 32
2412 .dodebc:
2413         ldx     [%o0 + %o3], %o4
2414         deccc   %o2
2415         stxa    %o4, [%o1 + %o3]ASI_USER
2416         bg,pt   %ncc, .dodebc
2417         addcc   %o3, 8, %o3
2418         !
2419         ! End of copy loop. Check to see if we're done. Most
2420         ! eight byte aligned copies end here.
2421         !
2422         bz,pt   %ncc, .dcofh
2423         nop
2424         !
2425         ! Something is left - do it byte for byte.
2426         ! 
2427         ba,pt   %ncc, .dcocl
2428         ldub    [%o0 + %o3], %o4        ! load next byte
2429         !
2430         ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
2431         !
2432         .align 32
2433 .dodfbc:
2434         lduw    [%o0 + %o3], %o4
2435         deccc   %o2
2436         sta     %o4, [%o1 + %o3]ASI_USER
2437         bg,pt   %ncc, .dodfbc
2438         addcc   %o3, 4, %o3
2439         !
2440         ! End of copy loop. Check to see if we're done. Most
2441         ! four byte aligned copies end here.
2442         !
2443         bz,pt   %ncc, .dcofh
2444         nop
2445         !
2446         ! Something is left. Do it byte for byte.
2447         !
2448         ba,pt   %ncc, .dcocl
2449         ldub    [%o0 + %o3], %o4        ! load next byte
2450         !
2451         ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
2452         ! copy.
2453         !
2454         .align 32
2455 .dodtbc:
2456         lduh    [%o0 + %o3], %o4
2457         deccc   %o2
2458         stha    %o4, [%o1 + %o3]ASI_USER
2459         bg,pt   %ncc, .dodtbc
2460         addcc   %o3, 2, %o3
2461         !
2462         ! End of copy loop. Anything left?
2463         !
2464         bz,pt   %ncc, .dcofh
2465         nop
2466         !
2467         ! Deal with the last byte
2468         !
2469         ldub    [%o0 + %o3], %o4
2470         stba    %o4, [%o1 + %o3]ASI_USER
2471 .dcofh:
2472         membar  #Sync
2473         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
2474         retl
2475         clr     %o0
2476 
2477 .big_copyout:
2478         !
2479         ! Are we using the FP registers?
2480         !
2481         rd      %fprs, %o3                      ! check for unused fp
2482         btst    FPRS_FEF, %o3
2483         bnz     %icc, .copyout_fpregs_inuse
2484         nop
2485         !
2486         ! We're going to go off and do a block copy.
2487         ! Switch fault handlers and grab a window. We
2488         ! don't do a membar #Sync since we've done only
2489         ! kernel data to this point.
2490         !
2491         stn     %o4, [THREAD_REG + T_LOFAULT]
2492         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2493         !
2494         ! %o3 is now %i3. Save original %fprs.
2495         !
2496         st      %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
2497         ba,pt   %ncc, .do_block_copyout         ! Not in use. Go off and do it.
2498         wr      %g0, FPRS_FEF, %fprs            ! clear %fprs
2499         !
2500 .copyout_fpregs_inuse:
2501         !
2502         ! We're here if the FP regs are in use. Need to see if the request
2503         ! exceeds our suddenly larger minimum.
2504         !
2505         cmp     %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
2506         bl      %ncc, .small_copyout
2507           nop
2508         !
2509         ! We're going to go off and do a block copy.
2510         ! Change to the heavy duty fault handler and grab a window first.
2511         !
2512         stn     %o4, [THREAD_REG + T_LOFAULT]
2513         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2514         st      %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
2515         !
2516         ! save in-use fpregs on stack
2517         !
2518         wr      %g0, FPRS_FEF, %fprs
2519         membar  #Sync
2520         add     %fp, STACK_BIAS - 257, %o2
2521         and     %o2, -64, %o2
2522         stda    %d0, [%o2]ASI_BLK_P
2523         add     %o2, 64, %o2
2524         stda    %d16, [%o2]ASI_BLK_P
2525         add     %o2, 64, %o2
2526         stda    %d32, [%o2]ASI_BLK_P
2527         add     %o2, 64, %o2
2528         stda    %d48, [%o2]ASI_BLK_P
2529         membar  #Sync
2530 
2531 .do_block_copyout:
2532         membar  #StoreStore|#StoreLoad|#LoadStore
2533 
2534         rd      %gsr, %o2
2535         st      %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
2536 
2537         ! Set the lower bit in the saved t_lofault to indicate
2538         ! that we need to clear the %fprs register on the way
2539         ! out
2540         or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 
2541 
2542         ! Swap src/dst since the code below is memcpy code
2543         ! and memcpy/bcopy have different calling sequences
2544         mov     %i1, %i5
2545         mov     %i0, %i1
2546         mov     %i5, %i0
2547 
2548 !!! This code is nearly identical to the version in the sun4u
2549 !!! libc_psr.  Most bugfixes made to that file should be
2550 !!! merged into this routine.
2551 
2552         andcc   %i0, 7, %o3
2553         bz      %ncc, copyout_blkcpy
2554         sub     %o3, 8, %o3
2555         neg     %o3
2556         sub     %i2, %o3, %i2
2557 
2558         ! Align Destination on double-word boundary
2559 
2560 2:      ldub    [%i1], %o4
2561         inc     %i1
2562         stba    %o4, [%i0]ASI_USER
2563         deccc   %o3
2564         bgu     %ncc, 2b
2565           inc   %i0
2566 copyout_blkcpy:
2567         andcc   %i0, 63, %i3
2568         bz,pn   %ncc, copyout_blalign   ! now block aligned
2569         sub     %i3, 64, %i3
2570         neg     %i3                     ! bytes till block aligned
2571         sub     %i2, %i3, %i2           ! update %i2 with new count
2572 
2573         ! Copy %i3 bytes till dst is block (64 byte) aligned. use
2574         ! double word copies.
2575 
2576         alignaddr %i1, %g0, %g1
2577         ldd     [%g1], %d0
2578         add     %g1, 8, %g1
2579 6:
2580         ldd     [%g1], %d2
2581         add     %g1, 8, %g1
2582         subcc   %i3, 8, %i3
2583         faligndata %d0, %d2, %d8
2584         stda     %d8, [%i0]ASI_USER
2585         add     %i1, 8, %i1
2586         bz,pn   %ncc, copyout_blalign
2587         add     %i0, 8, %i0
2588         ldd     [%g1], %d0
2589         add     %g1, 8, %g1
2590         subcc   %i3, 8, %i3
2591         faligndata %d2, %d0, %d8
2592         stda     %d8, [%i0]ASI_USER
2593         add     %i1, 8, %i1
2594         bgu,pn  %ncc, 6b
2595         add     %i0, 8, %i0
2596  
2597 copyout_blalign:
2598         membar  #StoreLoad
2599         ! %i2 = total length
2600         ! %i3 = blocks  (length - 64) / 64
2601         ! %i4 = doubles remaining  (length - blocks)
2602         sub     %i2, 64, %i3
2603         andn    %i3, 63, %i3
2604         sub     %i2, %i3, %i4
2605         andn    %i4, 7, %i4
2606         sub     %i4, 16, %i4
2607         sub     %i2, %i4, %i2
2608         sub     %i2, %i3, %i2
2609 
2610         andn    %i1, 0x3f, %l7          ! blk aligned address
2611         alignaddr %i1, %g0, %g0         ! gen %gsr
2612 
2613         srl     %i1, 3, %l5             ! bits 3,4,5 are now least sig in  %l5
2614         andcc   %l5, 7, %i5             ! mask everything except bits 1,2 3
2615         add     %i1, %i4, %i1
2616         add     %i1, %i3, %i1
2617 
2618         ldda    [%l7]ASI_BLK_P, %d0
2619         add     %l7, 64, %l7
2620         ldda    [%l7]ASI_BLK_P, %d16
2621         add     %l7, 64, %l7
2622         ldda    [%l7]ASI_BLK_P, %d32
2623         add     %l7, 64, %l7
2624         sub     %i3, 128, %i3
2625 
2626         ! switch statement to get us to the right 8 byte blk within a
2627         ! 64 byte block
2628 
2629         cmp      %i5, 4
2630         bgeu,a   copyout_hlf
2631         cmp      %i5, 6
2632         cmp      %i5, 2
2633         bgeu,a   copyout_sqtr
2634         nop
2635         cmp      %i5, 1
2636         be,a     copyout_seg1
2637         nop
2638         ba,pt    %ncc, copyout_seg0
2639         nop
2640 copyout_sqtr:
2641         be,a     copyout_seg2
2642         nop
2643         ba,pt    %ncc, copyout_seg3
2644         nop
2645 
2646 copyout_hlf:
2647         bgeu,a   copyout_fqtr
2648         nop      
2649         cmp      %i5, 5
2650         be,a     copyout_seg5
2651         nop
2652         ba,pt    %ncc, copyout_seg4
2653         nop
2654 copyout_fqtr:
2655         be,a     copyout_seg6
2656         nop
2657         ba,pt    %ncc, copyout_seg7
2658         nop
2659         
2660 copyout_seg0:
2661         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2662         FALIGN_D0
2663         ldda    [%l7]ASI_BLK_P, %d0
2664         stda    %d48, [%i0]ASI_BLK_AIUS
2665         add     %l7, 64, %l7
2666         subcc   %i3, 64, %i3
2667         bz,pn   %ncc, 0f
2668         add     %i0, 64, %i0
2669         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2670         FALIGN_D16
2671         ldda    [%l7]ASI_BLK_P, %d16
2672         stda    %d48, [%i0]ASI_BLK_AIUS
2673         add     %l7, 64, %l7
2674         subcc   %i3, 64, %i3
2675         bz,pn   %ncc, 1f
2676         add     %i0, 64, %i0
2677         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2678         FALIGN_D32
2679         ldda    [%l7]ASI_BLK_P, %d32
2680         stda    %d48, [%i0]ASI_BLK_AIUS
2681         add     %l7, 64, %l7
2682         subcc   %i3, 64, %i3
2683         bz,pn   %ncc, 2f
2684         add     %i0, 64, %i0
2685         ba,a,pt %ncc, copyout_seg0
2686 
2687 0:
2688         FALIGN_D16
2689         stda    %d48, [%i0]ASI_BLK_AIUS
2690         add     %i0, 64, %i0
2691         membar  #Sync
2692         FALIGN_D32
2693         stda    %d48, [%i0]ASI_BLK_AIUS
2694         ba,pt   %ncc, copyout_blkd0
2695         add     %i0, 64, %i0
2696 
2697 1:
2698         FALIGN_D32
2699         stda    %d48, [%i0]ASI_BLK_AIUS
2700         add     %i0, 64, %i0
2701         membar  #Sync
2702         FALIGN_D0
2703         stda    %d48, [%i0]ASI_BLK_AIUS
2704         ba,pt   %ncc, copyout_blkd16
2705         add     %i0, 64, %i0
2706 
2707 2:
2708         FALIGN_D0
2709         stda    %d48, [%i0]ASI_BLK_AIUS
2710         add     %i0, 64, %i0
2711         membar  #Sync
2712         FALIGN_D16
2713         stda    %d48, [%i0]ASI_BLK_AIUS
2714         ba,pt   %ncc, copyout_blkd32
2715         add     %i0, 64, %i0
2716 
2717 copyout_seg1:
2718         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2719         FALIGN_D2
2720         ldda    [%l7]ASI_BLK_P, %d0
2721         stda    %d48, [%i0]ASI_BLK_AIUS
2722         add     %l7, 64, %l7
2723         subcc   %i3, 64, %i3
2724         bz,pn   %ncc, 0f
2725         add     %i0, 64, %i0
2726         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2727         FALIGN_D18
2728         ldda    [%l7]ASI_BLK_P, %d16
2729         stda    %d48, [%i0]ASI_BLK_AIUS
2730         add     %l7, 64, %l7
2731         subcc   %i3, 64, %i3
2732         bz,pn   %ncc, 1f
2733         add     %i0, 64, %i0
2734         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2735         FALIGN_D34
2736         ldda    [%l7]ASI_BLK_P, %d32
2737         stda    %d48, [%i0]ASI_BLK_AIUS
2738         add     %l7, 64, %l7
2739         subcc   %i3, 64, %i3
2740         bz,pn   %ncc, 2f
2741         add     %i0, 64, %i0
2742         ba,a,pt %ncc, copyout_seg1
2743 0:
2744         FALIGN_D18
2745         stda    %d48, [%i0]ASI_BLK_AIUS
2746         add     %i0, 64, %i0
2747         membar  #Sync
2748         FALIGN_D34
2749         stda    %d48, [%i0]ASI_BLK_AIUS
2750         ba,pt   %ncc, copyout_blkd2
2751         add     %i0, 64, %i0
2752 
2753 1:
2754         FALIGN_D34
2755         stda    %d48, [%i0]ASI_BLK_AIUS
2756         add     %i0, 64, %i0
2757         membar  #Sync
2758         FALIGN_D2
2759         stda    %d48, [%i0]ASI_BLK_AIUS
2760         ba,pt   %ncc, copyout_blkd18
2761         add     %i0, 64, %i0
2762 
2763 2:
2764         FALIGN_D2
2765         stda    %d48, [%i0]ASI_BLK_AIUS
2766         add     %i0, 64, %i0
2767         membar  #Sync
2768         FALIGN_D18
2769         stda    %d48, [%i0]ASI_BLK_AIUS
2770         ba,pt   %ncc, copyout_blkd34
2771         add     %i0, 64, %i0
2772 
2773 copyout_seg2:
2774         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2775         FALIGN_D4
2776         ldda    [%l7]ASI_BLK_P, %d0
2777         stda    %d48, [%i0]ASI_BLK_AIUS
2778         add     %l7, 64, %l7
2779         subcc   %i3, 64, %i3
2780         bz,pn   %ncc, 0f
2781         add     %i0, 64, %i0
2782         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2783         FALIGN_D20
2784         ldda    [%l7]ASI_BLK_P, %d16
2785         stda    %d48, [%i0]ASI_BLK_AIUS
2786         add     %l7, 64, %l7
2787         subcc   %i3, 64, %i3
2788         bz,pn   %ncc, 1f
2789         add     %i0, 64, %i0
2790         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2791         FALIGN_D36
2792         ldda    [%l7]ASI_BLK_P, %d32
2793         stda    %d48, [%i0]ASI_BLK_AIUS
2794         add     %l7, 64, %l7
2795         subcc   %i3, 64, %i3
2796         bz,pn   %ncc, 2f
2797         add     %i0, 64, %i0
2798         ba,a,pt %ncc, copyout_seg2
2799 
2800 0:
2801         FALIGN_D20
2802         stda    %d48, [%i0]ASI_BLK_AIUS
2803         add     %i0, 64, %i0
2804         membar  #Sync
2805         FALIGN_D36
2806         stda    %d48, [%i0]ASI_BLK_AIUS
2807         ba,pt   %ncc, copyout_blkd4
2808         add     %i0, 64, %i0
2809 
2810 1:
2811         FALIGN_D36
2812         stda    %d48, [%i0]ASI_BLK_AIUS
2813         add     %i0, 64, %i0
2814         membar  #Sync
2815         FALIGN_D4
2816         stda    %d48, [%i0]ASI_BLK_AIUS
2817         ba,pt   %ncc, copyout_blkd20
2818         add     %i0, 64, %i0
2819 
2820 2:
2821         FALIGN_D4
2822         stda    %d48, [%i0]ASI_BLK_AIUS
2823         add     %i0, 64, %i0
2824         membar  #Sync
2825         FALIGN_D20
2826         stda    %d48, [%i0]ASI_BLK_AIUS
2827         ba,pt   %ncc, copyout_blkd36
2828         add     %i0, 64, %i0
2829 
2830 copyout_seg3:
2831         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2832         FALIGN_D6
2833         ldda    [%l7]ASI_BLK_P, %d0
2834         stda    %d48, [%i0]ASI_BLK_AIUS
2835         add     %l7, 64, %l7
2836         subcc   %i3, 64, %i3
2837         bz,pn   %ncc, 0f
2838         add     %i0, 64, %i0
2839         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2840         FALIGN_D22
2841         ldda    [%l7]ASI_BLK_P, %d16
2842         stda    %d48, [%i0]ASI_BLK_AIUS
2843         add     %l7, 64, %l7
2844         subcc   %i3, 64, %i3
2845         bz,pn   %ncc, 1f
2846         add     %i0, 64, %i0
2847         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2848         FALIGN_D38
2849         ldda    [%l7]ASI_BLK_P, %d32
2850         stda    %d48, [%i0]ASI_BLK_AIUS
2851         add     %l7, 64, %l7
2852         subcc   %i3, 64, %i3
2853         bz,pn   %ncc, 2f
2854         add     %i0, 64, %i0
2855         ba,a,pt %ncc, copyout_seg3
2856 
2857 0:
2858         FALIGN_D22
2859         stda    %d48, [%i0]ASI_BLK_AIUS
2860         add     %i0, 64, %i0
2861         membar  #Sync
2862         FALIGN_D38
2863         stda    %d48, [%i0]ASI_BLK_AIUS
2864         ba,pt   %ncc, copyout_blkd6
2865         add     %i0, 64, %i0
2866 
2867 1:
2868         FALIGN_D38
2869         stda    %d48, [%i0]ASI_BLK_AIUS
2870         add     %i0, 64, %i0
2871         membar  #Sync
2872         FALIGN_D6
2873         stda    %d48, [%i0]ASI_BLK_AIUS
2874         ba,pt   %ncc, copyout_blkd22
2875         add     %i0, 64, %i0
2876 
2877 2:
2878         FALIGN_D6
2879         stda    %d48, [%i0]ASI_BLK_AIUS
2880         add     %i0, 64, %i0
2881         membar  #Sync
2882         FALIGN_D22
2883         stda    %d48, [%i0]ASI_BLK_AIUS
2884         ba,pt   %ncc, copyout_blkd38
2885         add     %i0, 64, %i0
2886 
2887 copyout_seg4:
2888         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2889         FALIGN_D8
2890         ldda    [%l7]ASI_BLK_P, %d0
2891         stda    %d48, [%i0]ASI_BLK_AIUS
2892         add     %l7, 64, %l7
2893         subcc   %i3, 64, %i3
2894         bz,pn   %ncc, 0f
2895         add     %i0, 64, %i0
2896         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2897         FALIGN_D24
2898         ldda    [%l7]ASI_BLK_P, %d16
2899         stda    %d48, [%i0]ASI_BLK_AIUS
2900         add     %l7, 64, %l7
2901         subcc   %i3, 64, %i3
2902         bz,pn   %ncc, 1f
2903         add     %i0, 64, %i0
2904         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2905         FALIGN_D40
2906         ldda    [%l7]ASI_BLK_P, %d32
2907         stda    %d48, [%i0]ASI_BLK_AIUS
2908         add     %l7, 64, %l7
2909         subcc   %i3, 64, %i3
2910         bz,pn   %ncc, 2f
2911         add     %i0, 64, %i0
2912         ba,a,pt %ncc, copyout_seg4
2913 
2914 0:
2915         FALIGN_D24
2916         stda    %d48, [%i0]ASI_BLK_AIUS
2917         add     %i0, 64, %i0
2918         membar  #Sync
2919         FALIGN_D40
2920         stda    %d48, [%i0]ASI_BLK_AIUS
2921         ba,pt   %ncc, copyout_blkd8
2922         add     %i0, 64, %i0
2923 
2924 1:
2925         FALIGN_D40
2926         stda    %d48, [%i0]ASI_BLK_AIUS
2927         add     %i0, 64, %i0
2928         membar  #Sync
2929         FALIGN_D8
2930         stda    %d48, [%i0]ASI_BLK_AIUS
2931         ba,pt   %ncc, copyout_blkd24
2932         add     %i0, 64, %i0
2933 
2934 2:
2935         FALIGN_D8
2936         stda    %d48, [%i0]ASI_BLK_AIUS
2937         add     %i0, 64, %i0
2938         membar  #Sync
2939         FALIGN_D24
2940         stda    %d48, [%i0]ASI_BLK_AIUS
2941         ba,pt   %ncc, copyout_blkd40
2942         add     %i0, 64, %i0
2943 
2944 copyout_seg5:
2945         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2946         FALIGN_D10
2947         ldda    [%l7]ASI_BLK_P, %d0
2948         stda    %d48, [%i0]ASI_BLK_AIUS
2949         add     %l7, 64, %l7
2950         subcc   %i3, 64, %i3
2951         bz,pn   %ncc, 0f
2952         add     %i0, 64, %i0
2953         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2954         FALIGN_D26
2955         ldda    [%l7]ASI_BLK_P, %d16
2956         stda    %d48, [%i0]ASI_BLK_AIUS
2957         add     %l7, 64, %l7
2958         subcc   %i3, 64, %i3
2959         bz,pn   %ncc, 1f
2960         add     %i0, 64, %i0
2961         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2962         FALIGN_D42
2963         ldda    [%l7]ASI_BLK_P, %d32
2964         stda    %d48, [%i0]ASI_BLK_AIUS
2965         add     %l7, 64, %l7
2966         subcc   %i3, 64, %i3
2967         bz,pn   %ncc, 2f
2968         add     %i0, 64, %i0
2969         ba,a,pt %ncc, copyout_seg5
2970 
2971 0:
2972         FALIGN_D26
2973         stda    %d48, [%i0]ASI_BLK_AIUS
2974         add     %i0, 64, %i0
2975         membar  #Sync
2976         FALIGN_D42
2977         stda    %d48, [%i0]ASI_BLK_AIUS
2978         ba,pt   %ncc, copyout_blkd10
2979         add     %i0, 64, %i0
2980 
2981 1:
2982         FALIGN_D42
2983         stda    %d48, [%i0]ASI_BLK_AIUS
2984         add     %i0, 64, %i0
2985         membar  #Sync
2986         FALIGN_D10
2987         stda    %d48, [%i0]ASI_BLK_AIUS
2988         ba,pt   %ncc, copyout_blkd26
2989         add     %i0, 64, %i0
2990 
2991 2:
2992         FALIGN_D10
2993         stda    %d48, [%i0]ASI_BLK_AIUS
2994         add     %i0, 64, %i0
2995         membar  #Sync
2996         FALIGN_D26
2997         stda    %d48, [%i0]ASI_BLK_AIUS
2998         ba,pt   %ncc, copyout_blkd42
2999         add     %i0, 64, %i0
3000 
3001 copyout_seg6:
3002         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3003         FALIGN_D12
3004         ldda    [%l7]ASI_BLK_P, %d0
3005         stda    %d48, [%i0]ASI_BLK_AIUS
3006         add     %l7, 64, %l7
3007         subcc   %i3, 64, %i3
3008         bz,pn   %ncc, 0f
3009         add     %i0, 64, %i0
3010         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3011         FALIGN_D28
3012         ldda    [%l7]ASI_BLK_P, %d16
3013         stda    %d48, [%i0]ASI_BLK_AIUS
3014         add     %l7, 64, %l7
3015         subcc   %i3, 64, %i3
3016         bz,pn   %ncc, 1f
3017         add     %i0, 64, %i0
3018         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3019         FALIGN_D44
3020         ldda    [%l7]ASI_BLK_P, %d32
3021         stda    %d48, [%i0]ASI_BLK_AIUS
3022         add     %l7, 64, %l7
3023         subcc   %i3, 64, %i3
3024         bz,pn   %ncc, 2f
3025         add     %i0, 64, %i0
3026         ba,a,pt %ncc, copyout_seg6
3027 
3028 0:
3029         FALIGN_D28
3030         stda    %d48, [%i0]ASI_BLK_AIUS
3031         add     %i0, 64, %i0
3032         membar  #Sync
3033         FALIGN_D44
3034         stda    %d48, [%i0]ASI_BLK_AIUS
3035         ba,pt   %ncc, copyout_blkd12
3036         add     %i0, 64, %i0
3037 
3038 1:
3039         FALIGN_D44
3040         stda    %d48, [%i0]ASI_BLK_AIUS
3041         add     %i0, 64, %i0
3042         membar  #Sync
3043         FALIGN_D12
3044         stda    %d48, [%i0]ASI_BLK_AIUS
3045         ba,pt   %ncc, copyout_blkd28
3046         add     %i0, 64, %i0
3047 
3048 2:
3049         FALIGN_D12
3050         stda    %d48, [%i0]ASI_BLK_AIUS
3051         add     %i0, 64, %i0
3052         membar  #Sync
3053         FALIGN_D28
3054         stda    %d48, [%i0]ASI_BLK_AIUS
3055         ba,pt   %ncc, copyout_blkd44
3056         add     %i0, 64, %i0
3057 
3058 copyout_seg7:
3059         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3060         FALIGN_D14
3061         ldda    [%l7]ASI_BLK_P, %d0
3062         stda    %d48, [%i0]ASI_BLK_AIUS
3063         add     %l7, 64, %l7
3064         subcc   %i3, 64, %i3
3065         bz,pn   %ncc, 0f
3066         add     %i0, 64, %i0
3067         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3068         FALIGN_D30
3069         ldda    [%l7]ASI_BLK_P, %d16
3070         stda    %d48, [%i0]ASI_BLK_AIUS
3071         add     %l7, 64, %l7
3072         subcc   %i3, 64, %i3
3073         bz,pn   %ncc, 1f
3074         add     %i0, 64, %i0
3075         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3076         FALIGN_D46
3077         ldda    [%l7]ASI_BLK_P, %d32
3078         stda    %d48, [%i0]ASI_BLK_AIUS
3079         add     %l7, 64, %l7
3080         subcc   %i3, 64, %i3
3081         bz,pn   %ncc, 2f
3082         add     %i0, 64, %i0
3083         ba,a,pt %ncc, copyout_seg7
3084 
3085 0:
3086         FALIGN_D30
3087         stda    %d48, [%i0]ASI_BLK_AIUS
3088         add     %i0, 64, %i0
3089         membar  #Sync
3090         FALIGN_D46
3091         stda    %d48, [%i0]ASI_BLK_AIUS
3092         ba,pt   %ncc, copyout_blkd14
3093         add     %i0, 64, %i0
3094 
3095 1:
3096         FALIGN_D46
3097         stda    %d48, [%i0]ASI_BLK_AIUS
3098         add     %i0, 64, %i0
3099         membar  #Sync
3100         FALIGN_D14
3101         stda    %d48, [%i0]ASI_BLK_AIUS
3102         ba,pt   %ncc, copyout_blkd30
3103         add     %i0, 64, %i0
3104 
3105 2:
3106         FALIGN_D14
3107         stda    %d48, [%i0]ASI_BLK_AIUS
3108         add     %i0, 64, %i0
3109         membar  #Sync
3110         FALIGN_D30
3111         stda    %d48, [%i0]ASI_BLK_AIUS
3112         ba,pt   %ncc, copyout_blkd46
3113         add     %i0, 64, %i0
3114 
3115 
3116         !
3117         ! dribble out the last partial block
3118         !
3119 copyout_blkd0:
3120         subcc   %i4, 8, %i4
3121         blu,pn  %ncc, copyout_blkdone
3122         faligndata %d0, %d2, %d48
3123         stda    %d48, [%i0]ASI_USER
3124         add     %i0, 8, %i0
3125 copyout_blkd2:
3126         subcc   %i4, 8, %i4
3127         blu,pn  %ncc, copyout_blkdone
3128         faligndata %d2, %d4, %d48
3129         stda    %d48, [%i0]ASI_USER
3130         add     %i0, 8, %i0
3131 copyout_blkd4:
3132         subcc   %i4, 8, %i4
3133         blu,pn  %ncc, copyout_blkdone
3134         faligndata %d4, %d6, %d48
3135         stda    %d48, [%i0]ASI_USER
3136         add     %i0, 8, %i0
3137 copyout_blkd6:
3138         subcc   %i4, 8, %i4
3139         blu,pn  %ncc, copyout_blkdone
3140         faligndata %d6, %d8, %d48
3141         stda    %d48, [%i0]ASI_USER
3142         add     %i0, 8, %i0
3143 copyout_blkd8:
3144         subcc   %i4, 8, %i4
3145         blu,pn  %ncc, copyout_blkdone
3146         faligndata %d8, %d10, %d48
3147         stda    %d48, [%i0]ASI_USER
3148         add     %i0, 8, %i0
3149 copyout_blkd10:
3150         subcc   %i4, 8, %i4
3151         blu,pn  %ncc, copyout_blkdone
3152         faligndata %d10, %d12, %d48
3153         stda    %d48, [%i0]ASI_USER
3154         add     %i0, 8, %i0
3155 copyout_blkd12:
3156         subcc   %i4, 8, %i4
3157         blu,pn  %ncc, copyout_blkdone
3158         faligndata %d12, %d14, %d48
3159         stda    %d48, [%i0]ASI_USER
3160         add     %i0, 8, %i0
3161 copyout_blkd14:
3162         subcc   %i4, 8, %i4
3163         blu,pn  %ncc, copyout_blkdone
3164         fsrc1   %d14, %d0
3165         ba,a,pt %ncc, copyout_blkleft
3166 
3167 copyout_blkd16:
3168         subcc   %i4, 8, %i4
3169         blu,pn  %ncc, copyout_blkdone
3170         faligndata %d16, %d18, %d48
3171         stda    %d48, [%i0]ASI_USER
3172         add     %i0, 8, %i0
3173 copyout_blkd18:
3174         subcc   %i4, 8, %i4
3175         blu,pn  %ncc, copyout_blkdone
3176         faligndata %d18, %d20, %d48
3177         stda    %d48, [%i0]ASI_USER
3178         add     %i0, 8, %i0
3179 copyout_blkd20:
3180         subcc   %i4, 8, %i4
3181         blu,pn  %ncc, copyout_blkdone
3182         faligndata %d20, %d22, %d48
3183         stda    %d48, [%i0]ASI_USER
3184         add     %i0, 8, %i0
3185 copyout_blkd22:
3186         subcc   %i4, 8, %i4
3187         blu,pn  %ncc, copyout_blkdone
3188         faligndata %d22, %d24, %d48
3189         stda    %d48, [%i0]ASI_USER
3190         add     %i0, 8, %i0
3191 copyout_blkd24:
3192         subcc   %i4, 8, %i4
3193         blu,pn  %ncc, copyout_blkdone
3194         faligndata %d24, %d26, %d48
3195         stda    %d48, [%i0]ASI_USER
3196         add     %i0, 8, %i0
3197 copyout_blkd26:
3198         subcc   %i4, 8, %i4
3199         blu,pn  %ncc, copyout_blkdone
3200         faligndata %d26, %d28, %d48
3201         stda    %d48, [%i0]ASI_USER
3202         add     %i0, 8, %i0
3203 copyout_blkd28:
3204         subcc   %i4, 8, %i4
3205         blu,pn  %ncc, copyout_blkdone
3206         faligndata %d28, %d30, %d48
3207         stda    %d48, [%i0]ASI_USER
3208         add     %i0, 8, %i0
3209 copyout_blkd30:
3210         subcc   %i4, 8, %i4
3211         blu,pn  %ncc, copyout_blkdone
3212         fsrc1   %d30, %d0
3213         ba,a,pt %ncc, copyout_blkleft
3214 copyout_blkd32:
3215         subcc   %i4, 8, %i4
3216         blu,pn  %ncc, copyout_blkdone
3217         faligndata %d32, %d34, %d48
3218         stda    %d48, [%i0]ASI_USER
3219         add     %i0, 8, %i0
3220 copyout_blkd34:
3221         subcc   %i4, 8, %i4
3222         blu,pn  %ncc, copyout_blkdone
3223         faligndata %d34, %d36, %d48
3224         stda    %d48, [%i0]ASI_USER
3225         add     %i0, 8, %i0
3226 copyout_blkd36:
3227         subcc   %i4, 8, %i4
3228         blu,pn  %ncc, copyout_blkdone
3229         faligndata %d36, %d38, %d48
3230         stda    %d48, [%i0]ASI_USER
3231         add     %i0, 8, %i0
3232 copyout_blkd38:
3233         subcc   %i4, 8, %i4
3234         blu,pn  %ncc, copyout_blkdone
3235         faligndata %d38, %d40, %d48
3236         stda    %d48, [%i0]ASI_USER
3237         add     %i0, 8, %i0
3238 copyout_blkd40:
3239         subcc   %i4, 8, %i4
3240         blu,pn  %ncc, copyout_blkdone
3241         faligndata %d40, %d42, %d48
3242         stda    %d48, [%i0]ASI_USER
3243         add     %i0, 8, %i0
3244 copyout_blkd42:
3245         subcc   %i4, 8, %i4
3246         blu,pn  %ncc, copyout_blkdone
3247         faligndata %d42, %d44, %d48
3248         stda    %d48, [%i0]ASI_USER
3249         add     %i0, 8, %i0
3250 copyout_blkd44:
3251         subcc   %i4, 8, %i4
3252         blu,pn  %ncc, copyout_blkdone
3253         faligndata %d44, %d46, %d48
3254         stda    %d48, [%i0]ASI_USER
3255         add     %i0, 8, %i0
3256 copyout_blkd46:
3257         subcc   %i4, 8, %i4
3258         blu,pn  %ncc, copyout_blkdone
3259         fsrc1   %d46, %d0
3260 
3261 copyout_blkleft:
3262 1:
3263         ldd     [%l7], %d2
3264         add     %l7, 8, %l7
3265         subcc   %i4, 8, %i4
3266         faligndata %d0, %d2, %d8
3267         stda    %d8, [%i0]ASI_USER
3268         blu,pn  %ncc, copyout_blkdone
3269         add     %i0, 8, %i0
3270         ldd     [%l7], %d0
3271         add     %l7, 8, %l7
3272         subcc   %i4, 8, %i4
3273         faligndata %d2, %d0, %d8
3274         stda    %d8, [%i0]ASI_USER
3275         bgeu,pt %ncc, 1b
3276         add     %i0, 8, %i0
3277 
3278 copyout_blkdone:
3279         tst     %i2
3280         bz,pt   %ncc, .copyout_exit
3281         and     %l3, 0x4, %l3           ! fprs.du = fprs.dl = 0
3282 
3283 7:      ldub    [%i1], %i4
3284         inc     %i1
3285         stba    %i4, [%i0]ASI_USER
3286         inc     %i0
3287         deccc   %i2
3288         bgu     %ncc, 7b
3289           nop
3290 
3291 .copyout_exit:
3292         membar  #StoreLoad|#StoreStore
3293         btst    FPUSED_FLAG, SAVED_LOFAULT
3294         bz      1f
3295           nop
3296 
3297         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
3298         wr      %o2, 0, %gsr            ! restore gsr
3299 
3300         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3301         btst    FPRS_FEF, %o3
3302         bz      4f
3303           nop
3304 
3305         ! restore fpregs from stack
3306         membar  #Sync
3307         add     %fp, STACK_BIAS - 257, %o2
3308         and     %o2, -64, %o2
3309         ldda    [%o2]ASI_BLK_P, %d0
3310         add     %o2, 64, %o2
3311         ldda    [%o2]ASI_BLK_P, %d16
3312         add     %o2, 64, %o2
3313         ldda    [%o2]ASI_BLK_P, %d32
3314         add     %o2, 64, %o2
3315         ldda    [%o2]ASI_BLK_P, %d48
3316         membar  #Sync
3317 
3318         ba,pt   %ncc, 1f
3319           wr    %o3, 0, %fprs           ! restore fprs
3320 
3321 4:
3322         FZERO                           ! zero all of the fpregs
3323         wr      %o3, 0, %fprs           ! restore fprs
3324 
3325 1:
3326         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3327         membar  #Sync                   ! sync error barrier
3328         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
3329         ret
3330         restore %g0, 0, %o0
3331 
3332 .copyout_err:
3333         ldn     [THREAD_REG + T_COPYOPS], %o4
3334         brz     %o4, 2f
3335         nop
3336         ldn     [%o4 + CP_COPYOUT], %g2
3337         jmp     %g2
3338         nop
3339 2:
3340         retl
3341         mov     -1, %o0
3342         SET_SIZE(copyout)
3343 
3344 
3345         ENTRY(xcopyout)
             !
             ! xcopyout entry stub: load the address of .xcopyout_err into
             ! REAL_LOFAULT and branch to .do_copyout, which does the work.
             ! NOTE(review): .do_copyout is defined outside this block;
             ! presumably it is the common copyout body and uses
             ! REAL_LOFAULT as the error continuation -- confirm there.
             !
3346         sethi   %hi(.xcopyout_err), REAL_LOFAULT
3347         b       .do_copyout
3348           or    REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT        ! delay slot: low bits of the address
3349 .xcopyout_err:
             !
             ! Error continuation. If the thread has a copyops vector
             ! (T_COPYOPS is non-zero), jump to its CP_XCOPYOUT entry;
             ! otherwise return to the caller with %o0 = %g1.
             ! NOTE(review): %g1 is not written in this block; presumably
             ! it holds the error code left by the fault path -- confirm.
             !
3350         ldn     [THREAD_REG + T_COPYOPS], %o4    ! %o4 = copyops vector of this thread
3351         brz     %o4, 2f                          ! no vector: return %g1 below
3352         nop
3353         ldn     [%o4 + CP_XCOPYOUT], %g2         ! %g2 = xcopyout handler from the vector
3354         jmp     %g2                              ! jump without linking; handler returns to our caller
3355         nop
3356 2:
3357         retl
3358         mov     %g1, %o0                         ! delay slot: return value = %g1
3359         SET_SIZE(xcopyout)
3360 
3361         ENTRY(xcopyout_little)
             !
             ! Byte-reversing copy: for i in [0, count), the byte at
             ! source[count - 1 - i] is stored to destination[i].
             !
             ! In:   %o0 = source address (read with plain ldub)
             !       %o1 = destination address (written through ASI_AIUSL)
             !       %o2 = byte count
             ! Out:  %o0 = 0 on the path visible here
             ! Uses: %o3 (negative index), %o4 (data byte), %o5 (saved t_lofault)
             !
             ! NOTE(review): .little_err is defined outside this block;
             ! presumably it restores t_lofault and returns an error when a
             ! store faults -- confirm there.
             !
3362         sethi   %hi(.little_err), %o4
3363         ldn     [THREAD_REG + T_LOFAULT], %o5   ! %o5 = previous t_lofault, restored at 2:
3364         or      %o4, %lo(.little_err), %o4
3365         membar  #Sync                   ! sync error barrier
3366         stn     %o4, [THREAD_REG + T_LOFAULT]   ! install .little_err as t_lofault
3367 
3368         subcc   %g0, %o2, %o3           ! %o3 = -count; sets Z when count == 0
3369         add     %o0, %o2, %o0           ! %o0 = source + count
3370         bz,pn   %ncc, 2f                ! check for zero bytes
3371         sub     %o2, 1, %o4             ! delay slot: %o4 = count - 1 (unused if branch taken)
3372         add     %o0, %o4, %o0           ! start w/last byte
3373         add     %o1, %o2, %o1           ! %o1 = destination + count
3374         ldub    [%o0+%o3], %o4          ! first load: source[count - 1]
3375 
             ! Loop invariant: %o1 + %o3 walks forward through the
             ! destination while %o0 + %o3 walks backward through the
             ! source (%o3 goes up by 1, %o0 goes down by 2 per pass).
3376 1:      stba    %o4, [%o1+%o3]ASI_AIUSL
3377         inccc   %o3                     ! carry is set when %o3 wraps from -1 to 0
3378         sub     %o0, 2, %o0             ! get next byte
3379         bcc,a,pt %ncc, 1b               ! loop while %o3 has not reached 0
3380           ldub  [%o0+%o3], %o4          ! annulled delay slot: load only when looping
3381 
3382 2:      membar  #Sync                   ! sync error barrier
3383         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3384         retl
3385         mov     %g0, %o0                ! return (0)
3386         SET_SIZE(xcopyout_little)
3387 
3388 /*
3389  * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
3390  */
3391 
3392         ENTRY(copyin)
3393         sethi   %hi(.copyin_err), REAL_LOFAULT
3394         or      REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
3395 
3396 .do_copyin:
3397         !
3398         ! Check the length and bail if zero.
3399         !
3400         tst     %o2
3401         bnz,pt  %ncc, 1f
3402           nop
3403         retl
3404           clr   %o0
3405 1:
3406         sethi   %hi(copyio_fault), %o4
3407         or      %o4, %lo(copyio_fault), %o4
3408         sethi   %hi(copyio_fault_nowindow), %o3
3409         ldn     [THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3410         or      %o3, %lo(copyio_fault_nowindow), %o3
3411         membar  #Sync
3412         stn     %o3, [THREAD_REG + T_LOFAULT]
3413 
3414         mov     %o0, SAVE_SRC
3415         mov     %o1, SAVE_DST
3416         mov     %o2, SAVE_COUNT
3417 
3418         !
3419         ! Check to see if we're more than SMALL_LIMIT.
3420         !
3421         subcc   %o2, SMALL_LIMIT, %o3
3422         bgu,a,pt %ncc, .dci_ns
3423         or      %o0, %o1, %o3
3424         !
3425         ! What was previously ".small_copyin"
3426         !
3427 .dcibcp:
3428         sub     %g0, %o2, %o3           ! setup for copy loop
3429         add     %o0, %o2, %o0
3430         add     %o1, %o2, %o1
3431         ba,pt   %ncc, .dcicl
3432         lduba   [%o0 + %o3]ASI_USER, %o4
3433         !
3434         ! %o0 and %o1 point at the end and remain pointing at the end
3435         ! of their buffers. We pull things out by adding %o3 (which is
3436         ! the negation of the length) to the buffer end which gives us
3437         ! the current location in the buffers. By incrementing %o3 we walk
3438         ! through both buffers without having to bump each buffer's
3439         ! pointer. A very fast 4 instruction loop.
3440         !
3441         .align 16
3442 .dcicl:
3443         stb     %o4, [%o1 + %o3]
3444         inccc   %o3
3445         bl,a,pt %ncc, .dcicl
3446         lduba   [%o0 + %o3]ASI_USER, %o4
3447         !
3448         ! We're done. Go home.
3449         !       
3450         membar  #Sync
3451         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3452         retl
3453         clr     %o0
3454         !
3455         ! Try aligned copies from here.
3456         !
3457 .dci_ns:
3458         !
3459         ! See if we're single byte aligned. If we are, check the
3460         ! limit for single byte copies. If we're smaller, or equal,
3461         ! bounce to the byte for byte copy loop. Otherwise do it in
3462         ! HW (if enabled).
3463         !
3464         btst    1, %o3
3465         bz,a,pt %icc, .dcih8
3466         btst    7, %o3
3467         !
3468         ! We're single byte aligned.
3469         !
3470         subcc   %o2, VIS_COPY_THRESHOLD, %o3
3471         bleu,pt %ncc, .dcibcp
3472         sethi   %hi(hw_copy_limit_1), %o3
3473         ld      [%o3 + %lo(hw_copy_limit_1)], %o3
3474         !
3475         ! Is HW copy on? If not do everything byte for byte.
3476         !
3477         tst     %o3
3478         bz,pn   %icc, .dcibcp
3479         subcc   %o3, %o2, %o3
3480         !
3481         ! Are we bigger than the HW limit? If not
3482         ! go to byte for byte.
3483         !
3484         bge,pt  %ncc, .dcibcp
3485         nop
3486         !
3487         ! We're big enough and copy is on. Do it with HW.
3488         !
3489         ba,pt   %ncc, .big_copyin
3490         nop
3491 .dcih8:
3492         !
3493         ! 8 byte aligned?
3494         !
3495         bnz,a   %ncc, .dcih4
3496         btst    3, %o3
3497         !
3498         ! We're eight byte aligned.
3499         !
3500         subcc   %o2, VIS_COPY_THRESHOLD, %o3
3501         bleu,pt %ncc, .dcis8
3502         sethi   %hi(hw_copy_limit_8), %o3
3503         ld      [%o3 + %lo(hw_copy_limit_8)], %o3
3504         !
3505         ! Is HW assist on? If not, do it with the aligned copy.
3506         !
3507         tst     %o3
3508         bz,pn   %icc, .dcis8
3509         subcc   %o3, %o2, %o3
3510         bge     %ncc, .dcis8
3511         nop
3512         ba,pt   %ncc, .big_copyin
3513         nop
3514 .dcis8:
3515         !
3516         ! Housekeeping for copy loops. Uses same idea as in the byte for
3517         ! byte copy loop above.
3518         !
3519         add     %o0, %o2, %o0
3520         add     %o1, %o2, %o1
3521         sub     %g0, %o2, %o3
3522         ba,pt   %ncc, .didebc
3523         srl     %o2, 3, %o2             ! Number of 8 byte chunks to copy
3524         !
3525         ! 4 byte aligned?
3526         !
3527 .dcih4:
3528         bnz     %ncc, .dcih2
3529         subcc   %o2, VIS_COPY_THRESHOLD, %o3
3530         bleu,pt %ncc, .dcis4
3531         sethi   %hi(hw_copy_limit_4), %o3
3532         ld      [%o3 + %lo(hw_copy_limit_4)], %o3
3533         !
3534         ! Is HW assist on? If not, do it with the aligned copy.
3535         !
3536         tst     %o3
3537         bz,pn   %icc, .dcis4
3538         subcc   %o3, %o2, %o3
3539         !
3540         ! We're negative if our size is larger than hw_copy_limit_4.
3541         !
3542         bge     %ncc, .dcis4
3543         nop
3544         ba,pt   %ncc, .big_copyin
3545         nop
3546 .dcis4:
3547         !
3548         ! Housekeeping for copy loops. Uses same idea as in the byte
3549         ! for byte copy loop above.
3550         !
3551         add     %o0, %o2, %o0
3552         add     %o1, %o2, %o1
3553         sub     %g0, %o2, %o3
3554         ba,pt   %ncc, .didfbc
3555         srl     %o2, 2, %o2             ! Number of 4 byte chunks to copy
3556 .dcih2:
3557         !
3558         ! We're two byte aligned. Check for "smallness"
3559         ! done in delay at .dcih4
3560         !
3561         bleu,pt %ncc, .dcis2
3562         sethi   %hi(hw_copy_limit_2), %o3
3563         ld      [%o3 + %lo(hw_copy_limit_2)], %o3
3564         !
3565         ! Is HW assist on? If not, do it with the aligned copy.
3566         !
3567         tst     %o3
3568         bz,pn   %icc, .dcis2
3569         subcc   %o3, %o2, %o3
3570         !
3571         ! Are we larger than the HW limit?
3572         !
3573         bge     %ncc, .dcis2
3574         nop
3575         !
3576         ! HW assist is on and we're large enough to use it.
3577         !
3578         ba,pt   %ncc, .big_copyin
3579         nop
3580         !
3581         ! Housekeeping for copy loops. Uses same idea as in the byte
3582         ! for byte copy loop above.
3583         !
3584 .dcis2:
3585         add     %o0, %o2, %o0
3586         add     %o1, %o2, %o1
3587         sub     %g0, %o2, %o3
3588         ba,pt   %ncc, .didtbc
3589         srl     %o2, 1, %o2             ! Number of 2 byte chunks to copy
3590         !
3591 .small_copyin:
3592         !
3593         ! Why are we doing this AGAIN? There are certain conditions in
3594         ! big copyin that will cause us to forgo the HW assisted copies
3595         ! and bounce back to a non-hw assisted copy. This dispatches
3596         ! those copies. Note that we branch around this in the main line
3597         ! code.
3598         !
3599         ! We make no check for limits or HW enablement here. We've
3600         ! already been told that we're a poster child so just go off
3601         ! and do it.
3602         !
3603         or      %o0, %o1, %o3
3604         btst    1, %o3
3605         bnz     %icc, .dcibcp           ! Most likely
3606         btst    7, %o3
3607         bz      %icc, .dcis8
3608         btst    3, %o3
3609         bz      %icc, .dcis4
3610         nop
3611         ba,pt   %ncc, .dcis2
3612         nop
3613         !
3614         ! Eight byte aligned copies. A steal from the original .small_copyin
3615         ! with modifications. %o2 is number of 8 byte chunks to copy. When
3616         ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
3617         ! to copy.
3618         !
3619         .align 32
3620 .didebc:
3621         ldxa    [%o0 + %o3]ASI_USER, %o4
3622         deccc   %o2
3623         stx     %o4, [%o1 + %o3]
3624         bg,pt   %ncc, .didebc
3625         addcc   %o3, 8, %o3
3626         !
3627         ! End of copy loop. Most 8 byte aligned copies end here.
3628         !
3629         bz,pt   %ncc, .dcifh
3630         nop
3631         !
3632         ! Something is left. Do it byte for byte.
3633         !
3634         ba,pt   %ncc, .dcicl
3635         lduba   [%o0 + %o3]ASI_USER, %o4
3636         !
3637         ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
3638         !
3639         .align 32
3640 .didfbc:
3641         lduwa   [%o0 + %o3]ASI_USER, %o4
3642         deccc   %o2
3643         st      %o4, [%o1 + %o3]
3644         bg,pt   %ncc, .didfbc
3645         addcc   %o3, 4, %o3
3646         !
3647         ! End of copy loop. Most 4 byte aligned copies end here.
3648         !
3649         bz,pt   %ncc, .dcifh
3650         nop
3651         !
3652         ! Something is left. Do it byte for byte.
3653         !
3654         ba,pt   %ncc, .dcicl
3655         lduba   [%o0 + %o3]ASI_USER, %o4
3656         !
3657         ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
3658         ! copy.
3659         !
3660         .align 32
3661 .didtbc:
3662         lduha   [%o0 + %o3]ASI_USER, %o4
3663         deccc   %o2
3664         sth     %o4, [%o1 + %o3]
3665         bg,pt   %ncc, .didtbc
3666         addcc   %o3, 2, %o3
3667         !
3668         ! End of copy loop. Most 2 byte aligned copies end here.
3669         !
3670         bz,pt   %ncc, .dcifh
3671         nop
3672         !
3673         ! Deal with the last byte
3674         !
3675         lduba   [%o0 + %o3]ASI_USER, %o4
3676         stb     %o4, [%o1 + %o3]
3677 .dcifh:
3678         membar  #Sync
3679         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3680         retl
3681         clr     %o0
3682 
3683 .big_copyin:
3684         !
3685         ! Are we using the FP registers?
3686         !
3687         rd      %fprs, %o3              ! check for unused fp
3688         btst    FPRS_FEF, %o3
3689         bnz     %ncc, .copyin_fpregs_inuse
3690         nop
3691         !
3692         ! We're going off to do a block copy.
3693         ! Switch fault handlers and grab a window. We
3694         ! don't do a membar #Sync since we've done only
3695         ! kernel data to this point.
3696         !
3697         stn     %o4, [THREAD_REG + T_LOFAULT]
3698         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3699         !
3700         ! %o3 is %i3 after the save...
3701         !
3702         st      %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
3703         ba,pt   %ncc, .do_blockcopyin
3704         wr      %g0, FPRS_FEF, %fprs
3705 .copyin_fpregs_inuse:
3706         !
3707         ! We're here if the FP regs are in use. Need to see if the request
3708         ! exceeds our suddenly larger minimum. No register window has been
3709         ! taken yet (the save is below), so the length is still in %o2.
3710         cmp     %o2, VIS_COPY_THRESHOLD+(64*4)
3711         bl      %ncc, .small_copyin
3712         nop
3713         !
3714         ! We're going off and do a block copy.
3715         ! Change to the heavy duty fault handler and grab a window first.
3716         ! New handler is passed in %o4.
3717         !
3718         stn     %o4, [THREAD_REG + T_LOFAULT]
3719         save    %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3720         !
3721         ! %o3 (the %fprs value read at .big_copyin) is now %i3
3722         !
3723         st      %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
3724 
3725         ! save in-use fpregs on stack
3726         wr      %g0, FPRS_FEF, %fprs
3727         membar  #Sync
3728         add     %fp, STACK_BIAS - 257, %o2      ! %o2 is a scratch out register of the new window
3729         and     %o2, -64, %o2                   ! round down to a 64-byte boundary for the block stores
3730         stda    %d0, [%o2]ASI_BLK_P
3731         add     %o2, 64, %o2
3732         stda    %d16, [%o2]ASI_BLK_P
3733         add     %o2, 64, %o2
3734         stda    %d32, [%o2]ASI_BLK_P
3735         add     %o2, 64, %o2
3736         stda    %d48, [%o2]ASI_BLK_P
3737         membar  #Sync
3738 
3739 .do_blockcopyin:
3740         membar  #StoreStore|#StoreLoad|#LoadStore
3741 
3742         rd      %gsr, %o2
3743         st      %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
3744 
3745         ! Set the lower bit in the saved t_lofault to indicate
3746         ! that we need to clear the %fprs register on the way
3747         ! out
3748         or      SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3749 
3750         ! Swap src/dst since the code below is memcpy code
3751         ! and memcpy/bcopy have different calling sequences
3752         mov     %i1, %i5
3753         mov     %i0, %i1
3754         mov     %i5, %i0
3755 
3756 !!! This code is nearly identical to the version in the sun4u
3757 !!! libc_psr.  Most bugfixes made to that file should be
3758 !!! merged into this routine.
3759 
3760         andcc   %i0, 7, %o3
3761         bz      copyin_blkcpy
3762         sub     %o3, 8, %o3
3763         neg     %o3
3764         sub     %i2, %o3, %i2
3765 
3766         ! Align Destination on double-word boundary
3767 
3768 2:      lduba   [%i1]ASI_USER, %o4
3769         inc     %i1
3770         inc     %i0
3771         deccc   %o3
3772         bgu     %ncc, 2b
3773         stb     %o4, [%i0-1]
3774 copyin_blkcpy:
3775         andcc   %i0, 63, %i3
3776         bz,pn   %ncc, copyin_blalign    ! now block aligned
3777         sub     %i3, 64, %i3
3778         neg     %i3                     ! bytes till block aligned
3779         sub     %i2, %i3, %i2           ! update %i2 with new count
3780 
3781         ! Copy %i3 bytes till dst is block (64 byte) aligned. use
3782         ! double word copies.
3783 
3784         alignaddr %i1, %g0, %g1
3785         ldda    [%g1]ASI_USER, %d0
3786         add     %g1, 8, %g1
3787 6:
3788         ldda    [%g1]ASI_USER, %d2
3789         add     %g1, 8, %g1
3790         subcc   %i3, 8, %i3
3791         faligndata %d0, %d2, %d8
3792         std     %d8, [%i0]
3793         add     %i1, 8, %i1
3794         bz,pn   %ncc, copyin_blalign
3795         add     %i0, 8, %i0
3796         ldda    [%g1]ASI_USER, %d0
3797         add     %g1, 8, %g1
3798         subcc   %i3, 8, %i3
3799         faligndata %d2, %d0, %d8
3800         std     %d8, [%i0]
3801         add     %i1, 8, %i1
3802         bgu,pn  %ncc, 6b
3803         add     %i0, 8, %i0
3804  
3805 copyin_blalign:
3806         membar  #StoreLoad
3807         ! %i2 = total length
3808         ! %i3 = blocks  (length - 64) / 64
3809         ! %i4 = doubles remaining  (length - blocks)
3810         sub     %i2, 64, %i3
3811         andn    %i3, 63, %i3
3812         sub     %i2, %i3, %i4
3813         andn    %i4, 7, %i4
3814         sub     %i4, 16, %i4
3815         sub     %i2, %i4, %i2
3816         sub     %i2, %i3, %i2
3817 
3818         andn    %i1, 0x3f, %l7          ! blk aligned address
3819         alignaddr %i1, %g0, %g0         ! gen %gsr
3820 
3821         srl     %i1, 3, %l5             ! bits 3,4,5 are now least sig in  %l5
3822         andcc   %l5, 7, %i5             ! mask everything except bits 1,2 3
3823         add     %i1, %i4, %i1
3824         add     %i1, %i3, %i1
3825 
3826         ldda    [%l7]ASI_BLK_AIUS, %d0
3827         add     %l7, 64, %l7
3828         ldda    [%l7]ASI_BLK_AIUS, %d16
3829         add     %l7, 64, %l7
3830         ldda    [%l7]ASI_BLK_AIUS, %d32
3831         add     %l7, 64, %l7
3832         sub     %i3, 128, %i3
3833 
3834         ! switch statement to get us to the right 8 byte blk within a
3835         ! 64 byte block
3836 
3837         cmp      %i5, 4
3838         bgeu,a   copyin_hlf
3839         cmp      %i5, 6
3840         cmp      %i5, 2
3841         bgeu,a   copyin_sqtr
3842         nop
3843         cmp      %i5, 1
3844         be,a     copyin_seg1
3845         nop
3846         ba,pt    %ncc, copyin_seg0
3847         nop
3848 copyin_sqtr:
3849         be,a     copyin_seg2
3850         nop
3851         ba,pt    %ncc, copyin_seg3
3852         nop
3853 
3854 copyin_hlf:
3855         bgeu,a   copyin_fqtr
3856         nop      
3857         cmp      %i5, 5
3858         be,a     copyin_seg5
3859         nop
3860         ba,pt    %ncc, copyin_seg4
3861         nop
3862 copyin_fqtr:
3863         be,a     copyin_seg6
3864         nop
3865         ba,pt    %ncc, copyin_seg7
3866         nop
3867         
3868 copyin_seg0:
3869         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3870         FALIGN_D0
3871         ldda    [%l7]ASI_BLK_AIUS, %d0
3872         stda    %d48, [%i0]ASI_BLK_P
3873         add     %l7, 64, %l7
3874         subcc   %i3, 64, %i3
3875         bz,pn   %ncc, 0f
3876         add     %i0, 64, %i0
3877         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3878         FALIGN_D16
3879         ldda    [%l7]ASI_BLK_AIUS, %d16
3880         stda    %d48, [%i0]ASI_BLK_P
3881         add     %l7, 64, %l7
3882         subcc   %i3, 64, %i3
3883         bz,pn   %ncc, 1f
3884         add     %i0, 64, %i0
3885         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3886         FALIGN_D32
3887         ldda    [%l7]ASI_BLK_AIUS, %d32
3888         stda    %d48, [%i0]ASI_BLK_P
3889         add     %l7, 64, %l7
3890         subcc   %i3, 64, %i3
3891         bz,pn   %ncc, 2f
3892         add     %i0, 64, %i0
3893         ba,a,pt %ncc, copyin_seg0
3894 
3895 0:
3896         FALIGN_D16
3897         stda    %d48, [%i0]ASI_BLK_P
3898         add     %i0, 64, %i0
3899         membar  #Sync
3900         FALIGN_D32
3901         stda    %d48, [%i0]ASI_BLK_P
3902         ba,pt   %ncc, copyin_blkd0
3903         add     %i0, 64, %i0
3904 
3905 1:
3906         FALIGN_D32
3907         stda    %d48, [%i0]ASI_BLK_P
3908         add     %i0, 64, %i0
3909         membar  #Sync
3910         FALIGN_D0
3911         stda    %d48, [%i0]ASI_BLK_P
3912         ba,pt   %ncc, copyin_blkd16
3913         add     %i0, 64, %i0
3914 
3915 2:
3916         FALIGN_D0
3917         stda    %d48, [%i0]ASI_BLK_P
3918         add     %i0, 64, %i0
3919         membar  #Sync
3920         FALIGN_D16
3921         stda    %d48, [%i0]ASI_BLK_P
3922         ba,pt   %ncc, copyin_blkd32
3923         add     %i0, 64, %i0
3924 
3925 copyin_seg1:
3926         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3927         FALIGN_D2
3928         ldda    [%l7]ASI_BLK_AIUS, %d0
3929         stda    %d48, [%i0]ASI_BLK_P
3930         add     %l7, 64, %l7
3931         subcc   %i3, 64, %i3
3932         bz,pn   %ncc, 0f
3933         add     %i0, 64, %i0
3934         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3935         FALIGN_D18
3936         ldda    [%l7]ASI_BLK_AIUS, %d16
3937         stda    %d48, [%i0]ASI_BLK_P
3938         add     %l7, 64, %l7
3939         subcc   %i3, 64, %i3
3940         bz,pn   %ncc, 1f
3941         add     %i0, 64, %i0
3942         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3943         FALIGN_D34
3944         ldda    [%l7]ASI_BLK_AIUS, %d32
3945         stda    %d48, [%i0]ASI_BLK_P
3946         add     %l7, 64, %l7
3947         subcc   %i3, 64, %i3
3948         bz,pn   %ncc, 2f
3949         add     %i0, 64, %i0
3950         ba,a,pt %ncc, copyin_seg1
3951 0:
3952         FALIGN_D18
3953         stda    %d48, [%i0]ASI_BLK_P
3954         add     %i0, 64, %i0
3955         membar  #Sync
3956         FALIGN_D34
3957         stda    %d48, [%i0]ASI_BLK_P
3958         ba,pt   %ncc, copyin_blkd2
3959         add     %i0, 64, %i0
3960 
3961 1:
3962         FALIGN_D34
3963         stda    %d48, [%i0]ASI_BLK_P
3964         add     %i0, 64, %i0
3965         membar  #Sync
3966         FALIGN_D2
3967         stda    %d48, [%i0]ASI_BLK_P
3968         ba,pt   %ncc, copyin_blkd18
3969         add     %i0, 64, %i0
3970 
3971 2:
3972         FALIGN_D2
3973         stda    %d48, [%i0]ASI_BLK_P
3974         add     %i0, 64, %i0
3975         membar  #Sync
3976         FALIGN_D18
3977         stda    %d48, [%i0]ASI_BLK_P
3978         ba,pt   %ncc, copyin_blkd34
3979         add     %i0, 64, %i0
3980 copyin_seg2:
3981         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3982         FALIGN_D4
3983         ldda    [%l7]ASI_BLK_AIUS, %d0
3984         stda    %d48, [%i0]ASI_BLK_P
3985         add     %l7, 64, %l7
3986         subcc   %i3, 64, %i3
3987         bz,pn   %ncc, 0f
3988         add     %i0, 64, %i0
3989         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3990         FALIGN_D20
3991         ldda    [%l7]ASI_BLK_AIUS, %d16
3992         stda    %d48, [%i0]ASI_BLK_P
3993         add     %l7, 64, %l7
3994         subcc   %i3, 64, %i3
3995         bz,pn   %ncc, 1f
3996         add     %i0, 64, %i0
3997         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3998         FALIGN_D36
3999         ldda    [%l7]ASI_BLK_AIUS, %d32
4000         stda    %d48, [%i0]ASI_BLK_P
4001         add     %l7, 64, %l7
4002         subcc   %i3, 64, %i3
4003         bz,pn   %ncc, 2f
4004         add     %i0, 64, %i0
4005         ba,a,pt %ncc, copyin_seg2
4006 
4007 0:
4008         FALIGN_D20
4009         stda    %d48, [%i0]ASI_BLK_P
4010         add     %i0, 64, %i0
4011         membar  #Sync
4012         FALIGN_D36
4013         stda    %d48, [%i0]ASI_BLK_P
4014         ba,pt   %ncc, copyin_blkd4
4015         add     %i0, 64, %i0
4016 
4017 1:
4018         FALIGN_D36
4019         stda    %d48, [%i0]ASI_BLK_P
4020         add     %i0, 64, %i0
4021         membar  #Sync
4022         FALIGN_D4
4023         stda    %d48, [%i0]ASI_BLK_P
4024         ba,pt   %ncc, copyin_blkd20
4025         add     %i0, 64, %i0
4026 
4027 2:
4028         FALIGN_D4
4029         stda    %d48, [%i0]ASI_BLK_P
4030         add     %i0, 64, %i0
4031         membar  #Sync
4032         FALIGN_D20
4033         stda    %d48, [%i0]ASI_BLK_P
4034         ba,pt   %ncc, copyin_blkd36
4035         add     %i0, 64, %i0
4036 
4037 copyin_seg3:
4038         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4039         FALIGN_D6
4040         ldda    [%l7]ASI_BLK_AIUS, %d0
4041         stda    %d48, [%i0]ASI_BLK_P
4042         add     %l7, 64, %l7
4043         subcc   %i3, 64, %i3
4044         bz,pn   %ncc, 0f
4045         add     %i0, 64, %i0
4046         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4047         FALIGN_D22
4048         ldda    [%l7]ASI_BLK_AIUS, %d16
4049         stda    %d48, [%i0]ASI_BLK_P
4050         add     %l7, 64, %l7
4051         subcc   %i3, 64, %i3
4052         bz,pn   %ncc, 1f
4053         add     %i0, 64, %i0
4054         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4055         FALIGN_D38
4056         ldda    [%l7]ASI_BLK_AIUS, %d32
4057         stda    %d48, [%i0]ASI_BLK_P
4058         add     %l7, 64, %l7
4059         subcc   %i3, 64, %i3
4060         bz,pn   %ncc, 2f
4061         add     %i0, 64, %i0
4062         ba,a,pt %ncc, copyin_seg3
4063 
4064 0:
4065         FALIGN_D22
4066         stda    %d48, [%i0]ASI_BLK_P
4067         add     %i0, 64, %i0
4068         membar  #Sync
4069         FALIGN_D38
4070         stda    %d48, [%i0]ASI_BLK_P
4071         ba,pt   %ncc, copyin_blkd6
4072         add     %i0, 64, %i0
4073 
4074 1:
4075         FALIGN_D38
4076         stda    %d48, [%i0]ASI_BLK_P
4077         add     %i0, 64, %i0
4078         membar  #Sync
4079         FALIGN_D6
4080         stda    %d48, [%i0]ASI_BLK_P
4081         ba,pt   %ncc, copyin_blkd22
4082         add     %i0, 64, %i0
4083 
4084 2:
4085         FALIGN_D6
4086         stda    %d48, [%i0]ASI_BLK_P
4087         add     %i0, 64, %i0
4088         membar  #Sync
4089         FALIGN_D22
4090         stda    %d48, [%i0]ASI_BLK_P
4091         ba,pt   %ncc, copyin_blkd38
4092         add     %i0, 64, %i0
4093 
4094 copyin_seg4:
4095         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4096         FALIGN_D8
4097         ldda    [%l7]ASI_BLK_AIUS, %d0
4098         stda    %d48, [%i0]ASI_BLK_P
4099         add     %l7, 64, %l7
4100         subcc   %i3, 64, %i3
4101         bz,pn   %ncc, 0f
4102         add     %i0, 64, %i0
4103         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4104         FALIGN_D24
4105         ldda    [%l7]ASI_BLK_AIUS, %d16
4106         stda    %d48, [%i0]ASI_BLK_P
4107         add     %l7, 64, %l7
4108         subcc   %i3, 64, %i3
4109         bz,pn   %ncc, 1f
4110         add     %i0, 64, %i0
4111         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4112         FALIGN_D40
4113         ldda    [%l7]ASI_BLK_AIUS, %d32
4114         stda    %d48, [%i0]ASI_BLK_P
4115         add     %l7, 64, %l7
4116         subcc   %i3, 64, %i3
4117         bz,pn   %ncc, 2f
4118         add     %i0, 64, %i0
4119         ba,a,pt %ncc, copyin_seg4
4120 
4121 0:
4122         FALIGN_D24
4123         stda    %d48, [%i0]ASI_BLK_P
4124         add     %i0, 64, %i0
4125         membar  #Sync
4126         FALIGN_D40
4127         stda    %d48, [%i0]ASI_BLK_P
4128         ba,pt   %ncc, copyin_blkd8
4129         add     %i0, 64, %i0
4130 
4131 1:
4132         FALIGN_D40
4133         stda    %d48, [%i0]ASI_BLK_P
4134         add     %i0, 64, %i0
4135         membar  #Sync
4136         FALIGN_D8
4137         stda    %d48, [%i0]ASI_BLK_P
4138         ba,pt   %ncc, copyin_blkd24
4139         add     %i0, 64, %i0
4140 
4141 2:
4142         FALIGN_D8
4143         stda    %d48, [%i0]ASI_BLK_P
4144         add     %i0, 64, %i0
4145         membar  #Sync
4146         FALIGN_D24
4147         stda    %d48, [%i0]ASI_BLK_P
4148         ba,pt   %ncc, copyin_blkd40
4149         add     %i0, 64, %i0
4150 
4151 copyin_seg5:
4152         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4153         FALIGN_D10
4154         ldda    [%l7]ASI_BLK_AIUS, %d0
4155         stda    %d48, [%i0]ASI_BLK_P
4156         add     %l7, 64, %l7
4157         subcc   %i3, 64, %i3
4158         bz,pn   %ncc, 0f
4159         add     %i0, 64, %i0
4160         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4161         FALIGN_D26
4162         ldda    [%l7]ASI_BLK_AIUS, %d16
4163         stda    %d48, [%i0]ASI_BLK_P
4164         add     %l7, 64, %l7
4165         subcc   %i3, 64, %i3
4166         bz,pn   %ncc, 1f
4167         add     %i0, 64, %i0
4168         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4169         FALIGN_D42
4170         ldda    [%l7]ASI_BLK_AIUS, %d32
4171         stda    %d48, [%i0]ASI_BLK_P
4172         add     %l7, 64, %l7
4173         subcc   %i3, 64, %i3
4174         bz,pn   %ncc, 2f
4175         add     %i0, 64, %i0
4176         ba,a,pt %ncc, copyin_seg5
4177 
4178 0:
4179         FALIGN_D26
4180         stda    %d48, [%i0]ASI_BLK_P
4181         add     %i0, 64, %i0
4182         membar  #Sync
4183         FALIGN_D42
4184         stda    %d48, [%i0]ASI_BLK_P
4185         ba,pt   %ncc, copyin_blkd10
4186         add     %i0, 64, %i0
4187 
4188 1:
4189         FALIGN_D42
4190         stda    %d48, [%i0]ASI_BLK_P
4191         add     %i0, 64, %i0
4192         membar  #Sync
4193         FALIGN_D10
4194         stda    %d48, [%i0]ASI_BLK_P
4195         ba,pt   %ncc, copyin_blkd26
4196         add     %i0, 64, %i0
4197 
4198 2:
4199         FALIGN_D10
4200         stda    %d48, [%i0]ASI_BLK_P
4201         add     %i0, 64, %i0
4202         membar  #Sync
4203         FALIGN_D26
4204         stda    %d48, [%i0]ASI_BLK_P
4205         ba,pt   %ncc, copyin_blkd42
4206         add     %i0, 64, %i0
4207 
4208 copyin_seg6:
4209         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4210         FALIGN_D12
4211         ldda    [%l7]ASI_BLK_AIUS, %d0
4212         stda    %d48, [%i0]ASI_BLK_P
4213         add     %l7, 64, %l7
4214         subcc   %i3, 64, %i3
4215         bz,pn   %ncc, 0f
4216         add     %i0, 64, %i0
4217         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4218         FALIGN_D28
4219         ldda    [%l7]ASI_BLK_AIUS, %d16
4220         stda    %d48, [%i0]ASI_BLK_P
4221         add     %l7, 64, %l7
4222         subcc   %i3, 64, %i3
4223         bz,pn   %ncc, 1f
4224         add     %i0, 64, %i0
4225         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4226         FALIGN_D44
4227         ldda    [%l7]ASI_BLK_AIUS, %d32
4228         stda    %d48, [%i0]ASI_BLK_P
4229         add     %l7, 64, %l7
4230         subcc   %i3, 64, %i3
4231         bz,pn   %ncc, 2f
4232         add     %i0, 64, %i0
4233         ba,a,pt %ncc, copyin_seg6
4234 
4235 0:
4236         FALIGN_D28
4237         stda    %d48, [%i0]ASI_BLK_P
4238         add     %i0, 64, %i0
4239         membar  #Sync
4240         FALIGN_D44
4241         stda    %d48, [%i0]ASI_BLK_P
4242         ba,pt   %ncc, copyin_blkd12
4243         add     %i0, 64, %i0
4244 
4245 1:
4246         FALIGN_D44
4247         stda    %d48, [%i0]ASI_BLK_P
4248         add     %i0, 64, %i0
4249         membar  #Sync
4250         FALIGN_D12
4251         stda    %d48, [%i0]ASI_BLK_P
4252         ba,pt   %ncc, copyin_blkd28
4253         add     %i0, 64, %i0
4254 
4255 2:
4256         FALIGN_D12
4257         stda    %d48, [%i0]ASI_BLK_P
4258         add     %i0, 64, %i0
4259         membar  #Sync
4260         FALIGN_D28
4261         stda    %d48, [%i0]ASI_BLK_P
4262         ba,pt   %ncc, copyin_blkd44
4263         add     %i0, 64, %i0
4264 
4265 copyin_seg7:
4266         ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4267         FALIGN_D14
4268         ldda    [%l7]ASI_BLK_AIUS, %d0
4269         stda    %d48, [%i0]ASI_BLK_P
4270         add     %l7, 64, %l7
4271         subcc   %i3, 64, %i3
4272         bz,pn   %ncc, 0f
4273         add     %i0, 64, %i0
4274         ! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4275         FALIGN_D30
4276         ldda    [%l7]ASI_BLK_AIUS, %d16
4277         stda    %d48, [%i0]ASI_BLK_P
4278         add     %l7, 64, %l7
4279         subcc   %i3, 64, %i3
4280         bz,pn   %ncc, 1f
4281         add     %i0, 64, %i0
4282         ! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4283         FALIGN_D46
4284         ldda    [%l7]ASI_BLK_AIUS, %d32
4285         stda    %d48, [%i0]ASI_BLK_P
4286         add     %l7, 64, %l7
4287         subcc   %i3, 64, %i3
4288         bz,pn   %ncc, 2f
4289         add     %i0, 64, %i0
4290         ba,a,pt %ncc, copyin_seg7
4291 
4292 0:
4293         FALIGN_D30
4294         stda    %d48, [%i0]ASI_BLK_P
4295         add     %i0, 64, %i0
4296         membar  #Sync
4297         FALIGN_D46
4298         stda    %d48, [%i0]ASI_BLK_P
4299         ba,pt   %ncc, copyin_blkd14
4300         add     %i0, 64, %i0
4301 
4302 1:
4303         FALIGN_D46
4304         stda    %d48, [%i0]ASI_BLK_P
4305         add     %i0, 64, %i0
4306         membar  #Sync
4307         FALIGN_D14
4308         stda    %d48, [%i0]ASI_BLK_P
4309         ba,pt   %ncc, copyin_blkd30
4310         add     %i0, 64, %i0
4311 
4312 2:
4313         FALIGN_D14
4314         stda    %d48, [%i0]ASI_BLK_P
4315         add     %i0, 64, %i0
4316         membar  #Sync
4317         FALIGN_D30
4318         stda    %d48, [%i0]ASI_BLK_P
4319         ba,pt   %ncc, copyin_blkd46
4320         add     %i0, 64, %i0
4321 
4322 
4323         !
4324         ! dribble out the last partial block
4325         !
4326 copyin_blkd0:
4327         subcc   %i4, 8, %i4
4328         blu,pn  %ncc, copyin_blkdone
4329         faligndata %d0, %d2, %d48
4330         std     %d48, [%i0]
4331         add     %i0, 8, %i0
4332 copyin_blkd2:
4333         subcc   %i4, 8, %i4
4334         blu,pn  %ncc, copyin_blkdone
4335         faligndata %d2, %d4, %d48
4336         std     %d48, [%i0]
4337         add     %i0, 8, %i0
4338 copyin_blkd4:
4339         subcc   %i4, 8, %i4
4340         blu,pn  %ncc, copyin_blkdone
4341         faligndata %d4, %d6, %d48
4342         std     %d48, [%i0]
4343         add     %i0, 8, %i0
4344 copyin_blkd6:
4345         subcc   %i4, 8, %i4
4346         blu,pn  %ncc, copyin_blkdone
4347         faligndata %d6, %d8, %d48
4348         std     %d48, [%i0]
4349         add     %i0, 8, %i0
4350 copyin_blkd8:
4351         subcc   %i4, 8, %i4
4352         blu,pn  %ncc, copyin_blkdone
4353         faligndata %d8, %d10, %d48
4354         std     %d48, [%i0]
4355         add     %i0, 8, %i0
4356 copyin_blkd10:
4357         subcc   %i4, 8, %i4
4358         blu,pn  %ncc, copyin_blkdone
4359         faligndata %d10, %d12, %d48
4360         std     %d48, [%i0]
4361         add     %i0, 8, %i0
4362 copyin_blkd12:
4363         subcc   %i4, 8, %i4
4364         blu,pn  %ncc, copyin_blkdone
4365         faligndata %d12, %d14, %d48
4366         std     %d48, [%i0]
4367         add     %i0, 8, %i0
4368 copyin_blkd14:
4369         subcc   %i4, 8, %i4
4370         blu,pn  %ncc, copyin_blkdone
4371         fsrc1   %d14, %d0
4372         ba,a,pt %ncc, copyin_blkleft
4373 
4374 copyin_blkd16:
4375         subcc   %i4, 8, %i4
4376         blu,pn  %ncc, copyin_blkdone
4377         faligndata %d16, %d18, %d48
4378         std     %d48, [%i0]
4379         add     %i0, 8, %i0
4380 copyin_blkd18:
4381         subcc   %i4, 8, %i4
4382         blu,pn  %ncc, copyin_blkdone
4383         faligndata %d18, %d20, %d48
4384         std     %d48, [%i0]
4385         add     %i0, 8, %i0
4386 copyin_blkd20:
4387         subcc   %i4, 8, %i4
4388         blu,pn  %ncc, copyin_blkdone
4389         faligndata %d20, %d22, %d48
4390         std     %d48, [%i0]
4391         add     %i0, 8, %i0
4392 copyin_blkd22:
4393         subcc   %i4, 8, %i4
4394         blu,pn  %ncc, copyin_blkdone
4395         faligndata %d22, %d24, %d48
4396         std     %d48, [%i0]
4397         add     %i0, 8, %i0
4398 copyin_blkd24:
4399         subcc   %i4, 8, %i4
4400         blu,pn  %ncc, copyin_blkdone
4401         faligndata %d24, %d26, %d48
4402         std     %d48, [%i0]
4403         add     %i0, 8, %i0
4404 copyin_blkd26:
4405         subcc   %i4, 8, %i4
4406         blu,pn  %ncc, copyin_blkdone
4407         faligndata %d26, %d28, %d48
4408         std     %d48, [%i0]
4409         add     %i0, 8, %i0
4410 copyin_blkd28:
4411         subcc   %i4, 8, %i4
4412         blu,pn  %ncc, copyin_blkdone
4413         faligndata %d28, %d30, %d48
4414         std     %d48, [%i0]
4415         add     %i0, 8, %i0
4416 copyin_blkd30:
4417         subcc   %i4, 8, %i4
4418         blu,pn  %ncc, copyin_blkdone
4419         fsrc1   %d30, %d0
4420         ba,a,pt %ncc, copyin_blkleft
4421 copyin_blkd32:
4422         subcc   %i4, 8, %i4
4423         blu,pn  %ncc, copyin_blkdone
4424         faligndata %d32, %d34, %d48
4425         std     %d48, [%i0]
4426         add     %i0, 8, %i0
4427 copyin_blkd34:
4428         subcc   %i4, 8, %i4
4429         blu,pn  %ncc, copyin_blkdone
4430         faligndata %d34, %d36, %d48
4431         std     %d48, [%i0]
4432         add     %i0, 8, %i0
4433 copyin_blkd36:
4434         subcc   %i4, 8, %i4
4435         blu,pn  %ncc, copyin_blkdone
4436         faligndata %d36, %d38, %d48
4437         std     %d48, [%i0]
4438         add     %i0, 8, %i0
4439 copyin_blkd38:
4440         subcc   %i4, 8, %i4
4441         blu,pn  %ncc, copyin_blkdone
4442         faligndata %d38, %d40, %d48
4443         std     %d48, [%i0]
4444         add     %i0, 8, %i0
4445 copyin_blkd40:
4446         subcc   %i4, 8, %i4
4447         blu,pn  %ncc, copyin_blkdone
4448         faligndata %d40, %d42, %d48
4449         std     %d48, [%i0]
4450         add     %i0, 8, %i0
4451 copyin_blkd42:
4452         subcc   %i4, 8, %i4
4453         blu,pn  %ncc, copyin_blkdone
4454         faligndata %d42, %d44, %d48
4455         std     %d48, [%i0]
4456         add     %i0, 8, %i0
4457 copyin_blkd44:
4458         subcc   %i4, 8, %i4
4459         blu,pn  %ncc, copyin_blkdone
4460         faligndata %d44, %d46, %d48
4461         std     %d48, [%i0]
4462         add     %i0, 8, %i0
4463 copyin_blkd46:
4464         subcc   %i4, 8, %i4
4465         blu,pn  %ncc, copyin_blkdone
4466         fsrc1   %d46, %d0
4467 
4468 copyin_blkleft:
4469 1:
4470         ldda    [%l7]ASI_USER, %d2
4471         add     %l7, 8, %l7
4472         subcc   %i4, 8, %i4
4473         faligndata %d0, %d2, %d8
4474         std     %d8, [%i0]
4475         blu,pn  %ncc, copyin_blkdone
4476         add     %i0, 8, %i0
4477         ldda    [%l7]ASI_USER, %d0
4478         add     %l7, 8, %l7
4479         subcc   %i4, 8, %i4
4480         faligndata %d2, %d0, %d8
4481         std     %d8, [%i0]
4482         bgeu,pt %ncc, 1b
4483         add     %i0, 8, %i0
4484 
4485 copyin_blkdone:
4486         tst     %i2
4487         bz,pt   %ncc, .copyin_exit
4488         and     %l3, 0x4, %l3           ! fprs.du = fprs.dl = 0
4489 
4490 7:      lduba   [%i1]ASI_USER, %i4
4491         inc     %i1
4492         inc     %i0
4493         deccc   %i2
4494         bgu     %ncc, 7b
4495           stb     %i4, [%i0 - 1]
4496 
4497 .copyin_exit:
4498         membar  #StoreLoad|#StoreStore
4499         btst    FPUSED_FLAG, SAVED_LOFAULT      ! was the FP-used flag set in the saved t_lofault?
4500         bz      %icc, 1f                        ! no: skip the gsr/fprs/fpreg restore
4501           nop
4502 
4503         ld      [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
4504         wr      %o2, 0, %gsr
4505 
4506         ld      [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3     ! %o3 = %fprs value saved in the frame
4507         btst    FPRS_FEF, %o3                   ! was FEF set in the saved %fprs?
4508         bz      %icc, 4f                        ! no: nothing was saved, just zero the fpregs
4509           nop
4510 
4511         ! restore fpregs from stack
4512         membar  #Sync
4513         add     %fp, STACK_BIAS - 257, %o2      ! recompute the 64-byte aligned save area
4514         and     %o2, -64, %o2
4515         ldda    [%o2]ASI_BLK_P, %d0
4516         add     %o2, 64, %o2
4517         ldda    [%o2]ASI_BLK_P, %d16
4518         add     %o2, 64, %o2
4519         ldda    [%o2]ASI_BLK_P, %d32
4520         add     %o2, 64, %o2
4521         ldda    [%o2]ASI_BLK_P, %d48
4522         membar  #Sync
4523 
4524         ba,pt   %ncc, 1f
4525           wr    %o3, 0, %fprs           ! restore fprs
4526 
4527 4:
4528         FZERO                           ! zero all of the fpregs
4529         wr      %o3, 0, %fprs           ! restore fprs
4530 
4531 1:
4532         andn    SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT       ! strip the flag bit to recover the handler address
4533         membar  #Sync                           ! sync error barrier
4534         stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault
4535         ret
4536         restore %g0, 0, %o0                     ! (delay slot) pop the window; caller's %o0 = 0
4537 .copyin_err:
4538         ldn     [THREAD_REG + T_COPYOPS], %o4
4539         brz     %o4, 2f
4540         nop
4541         ldn     [%o4 + CP_COPYIN], %g2
4542         jmp     %g2
4543         nop
4544 2:
4545         retl
4546         mov     -1, %o0
4547         SET_SIZE(copyin)
4548 
/*
 * xcopyin
 *
 * Loads the address of .xcopyin_err into REAL_LOFAULT and branches to the
 * shared .do_copyin body (defined outside this section); the low half of
 * the address is merged in from the branch delay slot.
 *
 * .xcopyin_err: if the T_COPYOPS field of the current thread is non-zero,
 * jump to the CP_XCOPYIN entry of that table; otherwise return the value
 * held in %g1 to the caller.
 * NOTE(review): %g1 is presumably the error code supplied by the fault
 * path - confirm against the lofault/trap code, which is not visible here.
 */
4549         ENTRY(xcopyin)
4550         sethi   %hi(.xcopyin_err), REAL_LOFAULT
4551         b       .do_copyin
4552           or    REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
4553 .xcopyin_err:
4554         ldn     [THREAD_REG + T_COPYOPS], %o4
4555         brz     %o4, 2f                 /* no copyops table: return %g1 */
4556         nop
4557         ldn     [%o4 + CP_XCOPYIN], %g2
4558         jmp     %g2                     /* jump (no link) to the copyops entry */
4559         nop
4560 2:
4561         retl
4562         mov     %g1, %o0                /* delay slot: return value = %g1 */
4563         SET_SIZE(xcopyin)
4564 
/*
 * xcopyin_little
 *
 * Byte-reversing copy: the source is read through ASI_AIUSL starting at
 * its last byte and walking downwards, while the destination is written
 * with ordinary stores starting at its first byte and walking upwards.
 *
 *   %o0 - source address (read with lduba ... ASI_AIUSL)
 *   %o1 - destination address (written with stb)
 *   %o2 - number of bytes
 *   %o3 - loop index, runs from -count up to 0
 *   %o4 - scratch / byte in flight
 *   %o5 - previous t_lofault, restored on both exit paths
 *
 * Returns 0 on the normal path.  While the copy runs, t_lofault points at
 * .little_err, which restores t_lofault and returns the value in %g1.
 * NOTE(review): %g1 is presumably the error code set by the fault path -
 * confirm against the trap code, which is not visible here.
 */
4565         ENTRY(xcopyin_little)
4566         sethi   %hi(.little_err), %o4
4567         ldn     [THREAD_REG + T_LOFAULT], %o5
4568         or      %o4, %lo(.little_err), %o4
4569         membar  #Sync                           ! sync error barrier
4570         stn     %o4, [THREAD_REG + T_LOFAULT]   
4571 
4572         subcc   %g0, %o2, %o3           /* %o3 = -count; Z set if count == 0 */
4573         add     %o0, %o2, %o0
4574         bz,pn   %ncc, 2f                ! check for zero bytes
4575         sub     %o2, 1, %o4             /* delay slot: %o4 = count - 1 */
4576         add     %o0, %o4, %o0           ! start w/last byte     
4577         add     %o1, %o2, %o1           /* %o1 + %o3 now addresses dst[0] */
4578         lduba   [%o0+%o3]ASI_AIUSL, %o4
4579 
             /*
              * Each pass stores one byte, bumps %o3 by one and drops %o0 by
              * two, so the load address %o0+%o3 moves down one byte while
              * the store address %o1+%o3 moves up one byte.  inccc sets
              * carry only when %o3 wraps to zero, which ends the loop; the
              * annulled delay-slot load runs only when the branch is taken.
              */
4580 1:      stb     %o4, [%o1+%o3]
4581         inccc   %o3
4582         sub     %o0, 2, %o0             ! get next byte
4583         bcc,a,pt %ncc, 1b
4584           lduba [%o0+%o3]ASI_AIUSL, %o4
4585 
4586 2:      membar  #Sync                           ! sync error barrier
4587         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
4588         retl
4589         mov     %g0, %o0                ! return (0)
4590 
4591 .little_err:
4592         membar  #Sync                           ! sync error barrier
4593         stn     %o5, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
4594         retl
4595         mov     %g1, %o0                /* delay slot: return value = %g1 */
4596         SET_SIZE(xcopyin_little)
4597 
4598 
4599 /*
4600  * Copy a block of storage - must not overlap (from + len <= to).
4601  * No fault handler installed (to be called under on_fault())
 *
 * copyin_noerr hands the address of .copyio_noerr to the shared
 * .do_copyin body (defined outside this section) in REAL_LOFAULT; the low
 * half of the address is merged in from the branch delay slot.
 *
 * .copyio_noerr (also used by copyout_noerr) does nothing but jump to the
 * address held in SAVED_LOFAULT, so this routine produces no error return
 * of its own on that path.
 * NOTE(review): SAVED_LOFAULT is presumably the t_lofault value that was
 * current on entry to .do_copyin - confirm against that code.
4602  */
4603 
4604         ENTRY(copyin_noerr)
4605         sethi   %hi(.copyio_noerr), REAL_LOFAULT
4606         b       .do_copyin
4607           or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
4608 .copyio_noerr:
4609         jmp     SAVED_LOFAULT           /* jump to the address in SAVED_LOFAULT */
4610           nop
4611         SET_SIZE(copyin_noerr)
4612 
4613 /*
4614  * Copy a block of storage - must not overlap (from + len <= to).
4615  * No fault handler installed (to be called under on_fault())
 *
 * copyout_noerr hands the address of .copyio_noerr (the label defined
 * inside copyin_noerr) to the shared .do_copyout body (defined outside
 * this section) in REAL_LOFAULT; the low half of the address is merged in
 * from the branch delay slot.  There is no code of its own after the
 * branch.
4616  */
4617 
4618         ENTRY(copyout_noerr)
4619         sethi   %hi(.copyio_noerr), REAL_LOFAULT
4620         b       .do_copyout
4621           or    REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
4622         SET_SIZE(copyout_noerr)
4623 
/*
 * Global 32-bit data words, each declared with DGDEF and given an initial
 * value with .word.
 *
 * The three use_hw_* words start out as 1 and the four hw_copy_limit_*
 * words start out as 0.  The hwblkclr header comment in this file makes
 * the caller responsible for checking use_hw_bzero.
 * NOTE(review): the readers of the other words are outside this section;
 * the names suggest enable switches for the hardware-assisted paths and
 * per-alignment (1/2/4/8 byte) size thresholds - confirm before relying
 * on that.
 */
4624         .align  4
4625         DGDEF(use_hw_bcopy)
4626         .word   1
4627         DGDEF(use_hw_copyio)
4628         .word   1
4629         DGDEF(use_hw_bzero)
4630         .word   1
4631         DGDEF(hw_copy_limit_1)
4632         .word   0
4633         DGDEF(hw_copy_limit_2)
4634         .word   0
4635         DGDEF(hw_copy_limit_4)
4636         .word   0
4637         DGDEF(hw_copy_limit_8)
4638         .word   0
4639 
             /*
              * Switch back to the text section for the routines that follow.
              * NOTE(review): the .align 64 is issued before the .section
              * directive, so it applies to whatever section is current at
              * that point - confirm this is the intended one.
              */
4640         .align  64
4641         .section ".text"
4642 
4643 
4644 /*
4645  * hwblkclr - clears block-aligned, block-multiple-sized regions that are
4646  * at least 256 bytes in length using spitfire's block stores.  If
4647  * the criteria for using this routine are not met then it calls bzero
4648  * and returns 1.  Otherwise 0 is returned indicating success.
4649  * Caller is responsible for ensuring use_hw_bzero is true and that
4650  * kpreempt_disable() has been called.
4651  */
4652         ! %i0 - start address
4653         ! %i1 - length of region (multiple of 64)
4654         ! %l0 - saved fprs
4655         ! %l1 - pointer to saved %d0 block
4656         ! %l2 - saved curthread->t_lwp (not referenced by the code below)
4657 
4658         ENTRY(hwblkclr)
4659         ! get another window w/space for one aligned block of saved fpregs
4660         save    %sp, -SA(MINFRAME + 2*64), %sp
4661 
4662         ! Must be block-aligned
4663         andcc   %i0, (64-1), %g0
4664         bnz,pn  %ncc, 1f
4665           nop
4666 
4667         ! ... and must be 256 bytes or more
4668         cmp     %i1, 256
4669         blu,pn  %ncc, 1f
4670           nop
4671 
4672         ! ... and length must be a multiple of 64
4673         andcc   %i1, (64-1), %g0
4674         bz,pn   %ncc, 2f                /* all three checks passed */
4675           nop
4676 
4677 1:      ! punt, call bzero but notify the caller that bzero was used
4678         mov     %i0, %o0
4679         call    bzero
4680           mov   %i1, %o1
4681         ret
4682         restore %g0, 1, %o0     ! return (1) - did not use block operations
4683 
4684 2:      rd      %fprs, %l0              ! check for unused fp
4685         btst    FPRS_FEF, %l0
4686         bz      1f                      /* FEF clear: skip the save; targets the second 1: below */
4687           nop
4688 
4689         ! save in-use fpregs on stack
4690         membar  #Sync
4691         add     %fp, STACK_BIAS - 65, %l1
4692         and     %l1, -64, %l1           /* round down to a 64-byte boundary */
4693         stda    %d0, [%l1]ASI_BLK_P     /* block-store starting at %d0 into the frame */
4694 
4695 1:      membar  #StoreStore|#StoreLoad|#LoadStore
4696         wr      %g0, FPRS_FEF, %fprs    /* %fprs = FPRS_FEF only */
4697         wr      %g0, ASI_BLK_P, %asi    /* the stda ...%asi stores below use ASI_BLK_P */
4698 
4699         ! Clear block
4700         fzero   %d0
4701         fzero   %d2
4702         fzero   %d4
4703         fzero   %d6
4704         fzero   %d8
4705         fzero   %d10
4706         fzero   %d12
4707         fzero   %d14
4708 
4709         mov     256, %i3                /* bytes cleared per full pass of the loop */
4710         ba      .pz_doblock
4711           nop
4712 
             /*
              * Main loop body: four block stores per pass at offsets 192
              * (issued from the delay slot of the branch at .pz_doblock),
              * 128, 64 and 0.  The code at .pz_doblock also jumps into the
              * middle of this sequence, so the three stda instructions must
              * stay immediately before .pz_zinst.
              */
4713 .pz_blkstart:   
4714       ! stda    %d0, [%i0+192]%asi  ! in dly slot of branch that got us here
4715         stda    %d0, [%i0+128]%asi
4716         stda    %d0, [%i0+64]%asi
4717         stda    %d0, [%i0]%asi
4718 .pz_zinst:
4719         add     %i0, %i3, %i0           /* advance address by bytes just cleared */
4720         sub     %i1, %i3, %i1           /* and reduce the remaining length */
4721 .pz_doblock:
4722         cmp     %i1, 256
4723         bgeu,a  %ncc, .pz_blkstart
4724           stda  %d0, [%i0+192]%asi      /* annulled: executes only when the branch is taken */
4725 
4726         cmp     %i1, 64
4727         blu     %ncc, .pz_finish
             /*
              * The andn below occupies the delay slot of the blu above, so
              * it also executes when the branch is taken; %i3 is not read
              * again after .pz_finish.
              *
              * Otherwise 64..255 bytes remain.  %i3 = remaining length
              * rounded down to a multiple of 64 and %i2 = %i3 / 16, i.e.
              * 4 bytes (one instruction) per 64-byte block.  Jumping that
              * far before .pz_zinst executes one stda per remaining block;
              * .pz_zinst then subtracts %i3 and the loop test is repeated.
              */
4728         
4729         andn    %i1, (64-1), %i3
4730         srl     %i3, 4, %i2             ! using blocks, 1 instr / 16 words
4731         set     .pz_zinst, %i4
4732         sub     %i4, %i2, %i4
4733         jmp     %i4
4734           nop
4735 
4736 .pz_finish:
4737         membar  #Sync
4738         btst    FPRS_FEF, %l0           /* was FEF set in the saved %fprs? */
4739         bz,a    .pz_finished            /* no: only %fprs needs restoring (annulled delay slot) */
4740           wr    %l0, 0, %fprs           ! restore fprs
4741 
4742         ! restore fpregs from stack
4743         ldda    [%l1]ASI_BLK_P, %d0
4744         membar  #Sync
4745         wr      %l0, 0, %fprs           ! restore fprs
4746 
4747 .pz_finished:
4748         ret
4749         restore %g0, 0, %o0             ! return (bzero or not)
4750         SET_SIZE(hwblkclr)
4751 
4752         /*
4753          * Copy 32 bytes of data from src (%o0) to dst (%o1)
4754          * using physical addresses.
              *
              * The copy is done as four 8-byte ldxa loads followed by four
              * 8-byte stxa stores, all through ASI_MEM.  PSTATE_IE is
              * cleared in %pstate for the duration of the copy and the
              * original %pstate is written back in the delay slot of the
              * return.
              *
              * Leaf routine (returns with retl).  Modifies %o0-%o5, %g1
              * and %g2; %o0 and %o1 are advanced by 24 on return.
4755          */
4756         ENTRY_NP(hw_pa_bcopy32)
4757         rdpr    %pstate, %g1            /* %g1 = original %pstate, kept for the restore */
4758         andn    %g1, PSTATE_IE, %g2
4759         wrpr    %g0, %g2, %pstate       /* run with PSTATE_IE cleared */
4760 
4761         ldxa    [%o0]ASI_MEM, %o2       /* src bytes  0..7  */
4762         add     %o0, 8, %o0
4763         ldxa    [%o0]ASI_MEM, %o3       /* src bytes  8..15 */
4764         add     %o0, 8, %o0
4765         ldxa    [%o0]ASI_MEM, %o4       /* src bytes 16..23 */
4766         add     %o0, 8, %o0
4767         ldxa    [%o0]ASI_MEM, %o5       /* src bytes 24..31 */
4768         stxa    %o2, [%o1]ASI_MEM       /* dst bytes  0..7  */
4769         add     %o1, 8, %o1
4770         stxa    %o3, [%o1]ASI_MEM       /* dst bytes  8..15 */
4771         add     %o1, 8, %o1
4772         stxa    %o4, [%o1]ASI_MEM       /* dst bytes 16..23 */
4773         add     %o1, 8, %o1
4774         stxa    %o5, [%o1]ASI_MEM       /* dst bytes 24..31 */
4775 
4776         membar  #Sync
4777         retl
4778           wrpr    %g0, %g1, %pstate     /* delay slot: restore the original %pstate */
4779         SET_SIZE(hw_pa_bcopy32)