1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2009, Intel Corporation
  28  * All rights reserved.
  29  */
  30 
  31 /*       Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.        */
  32 /*       Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T              */
  33 /*         All Rights Reserved                                          */
  34 
  35 /*       Copyright (c) 1987, 1988 Microsoft Corporation                 */
  36 /*         All Rights Reserved                                          */
  37 
  38 /*
  39  * Copyright 2019 Joyent, Inc.
  40  */
  41 
  42 #include <sys/errno.h>
  43 #include <sys/asm_linkage.h>
  44 
  45 #include "assym.h"
  46 
  47 #define KCOPY_MIN_SIZE  128     /* Must be >= 16 bytes */
  48 #define XCOPY_MIN_SIZE  128     /* Must be >= 16 bytes */
/*
 * Non-temporal access (NTA) alignment requirement
 */
  52 #define NTA_ALIGN_SIZE  4       /* Must be at least 4-byte aligned */
  53 #define NTA_ALIGN_MASK  _CONST(NTA_ALIGN_SIZE-1)
  54 #define COUNT_ALIGN_SIZE        16      /* Must be at least 16-byte aligned */
  55 #define COUNT_ALIGN_MASK        _CONST(COUNT_ALIGN_SIZE-1)
  56 
/*
 * With the introduction of Broadwell, Intel has introduced supervisor mode
 * access protection -- SMAP. SMAP forces the kernel to set certain bits to
 * enable access of user pages (AC in rflags, defined as PS_ACHK in
 * <sys/psw.h>). One of the challenges is that many of the userland copy
 * routines are implemented directly on top of the kernel ones. For example,
 * copyin and copyout simply jump to the do_copy_fault label and traditionally
 * let it deal with the return for them. In fact, changing that is a can of
 * frame pointers.
 *
 * Rules and Constraints:
 *
 * 1. For anything that's not in copy.s, we have it do explicit calls to the
 * SMAP related code. It usually is in a position where it is able to. This is
 * restricted to the following three places: DTrace, resume() in swtch.s and
 * on_fault/no_fault. If you want to add it somewhere else, we should be
 * thinking twice.
 *
 * 2. We try to toggle this over the smallest window possible. This means that
 * if we take a fault, or need to fall back to a copyop in copyin() or
 * copyout() or any other function, we will always leave with SMAP enabled
 * (the kernel cannot access user pages).
 *
 * 3. None of the *_noerr() or ucopy/uzero routines should toggle SMAP. They
 * are explicitly only allowed to be called while in an on_fault()/no_fault()
 * handler, which already takes care of ensuring that SMAP is enabled and
 * disabled (see the sketch following this block comment). Note this means
 * that when under an on_fault()/no_fault() handler, one must not call the
 * non-*_noerr() routines.
 *
 * 4. The first thing we should do after coming out of an lofault handler is to
 * make sure that we call smap_enable() again to ensure that we are safely
 * protected, as more often than not, we will have disabled SMAP to get there.
 *
 * 5. The SMAP functions, smap_enable() and smap_disable(), may not touch any
 * registers beyond those used by the call and ret instructions. These routines
 * may be called from arbitrary contexts in copy.s where we have slightly more
 * special ABIs in place.
 *
 * 6. For any inline user of SMAP, the appropriate SMAP_ENABLE_INSTR and
 * SMAP_DISABLE_INSTR macro should be used (except for smap_enable() and
 * smap_disable()). If the number of these is changed, you must update the
 * constants SMAP_ENABLE_COUNT and SMAP_DISABLE_COUNT below.
 *
 * 7. Note, at this time SMAP is not implemented for the 32-bit kernel. There
 * is no known technical reason preventing it from being enabled.
 *
 * 8. Generally this .s file is processed by a K&R style cpp. This means that
 * it really has a lot of feelings about whitespace. In particular, if you have
 * a macro FOO with the arguments FOO(1, 3), the second argument is in fact
 * ' 3'.
 *
 * 9. The smap_enable and smap_disable functions should not generally be
 * called. They exist such that DTrace and on_trap() may use them, that's it.
 *
 * 10. In general, the kernel has its own value for rflags that gets used. This
 * is maintained in a few different places which vary based on how the thread
 * comes into existence and whether it's a user thread. In general, when the
 * kernel takes a trap, it will always set rflags to a known state, mainly as
 * part of ENABLE_INTR_FLAGS and F_OFF and F_ON. These ensure that PS_ACHK is
 * cleared for us. In addition, when using the sysenter instruction, we mask
 * PS_ACHK off via the AMD_SFMASK MSR. See init_cpu_syscall() for where that
 * gets masked off.
 */
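
/*
 * To illustrate rule 3 above, the expected shape of a caller of the
 * *_noerr() routines is sketched below in C.  This is an illustrative
 * sketch only; the variable names and the error value chosen here are this
 * comment's, not something mandated by the interface.
 *
 *	label_t	ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);
 *	}
 *	copyin_noerr(uaddr, kaddr, count);
 *	no_fault();
 *
 * on_fault() installs the fault handler (and, per rule 1, takes care of
 * disabling SMAP so user pages are accessible); no_fault() removes the
 * handler and leaves SMAP enabled again, which is why the *_noerr()
 * routines themselves must never toggle it.
 */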
 119 
/*
 * The optimal 64-bit bcopy and kcopy for modern x86 processors use
 * "rep smovq" for large sizes. Performance data shows that many calls to
 * bcopy/kcopy/bzero/kzero operate on small buffers. For best performance at
 * these small sizes, unrolled code is used. For medium sizes, loops that
 * write 64 bytes per iteration are used. Transition points were determined
 * experimentally.
 */
 127 #define BZERO_USE_REP   (1024)
 128 #define BCOPY_DFLT_REP  (128)
 129 #define BCOPY_NHM_REP   (768)
 130 
 131 /*
 132  * Copy a block of storage, returning an error code if `from' or
 133  * `to' takes a kernel pagefault which cannot be resolved.
 134  * Returns errno value on pagefault error, 0 if all ok
 135  */
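
/*
 * For reference while reading the assembly, kcopy's C-level contract is
 * conventionally:
 *
 *	int kcopy(const void *from, void *to, size_t count);
 *
 * (prototype assumed from the usual <sys/systm.h> declaration, not restated
 * from this file).  Arguments arrive per the AMD64 ABI: %rdi = from,
 * %rsi = to, %rdx = count; the return value is 0 or an errno value.
 */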
 136 
 137 /*
 138  * I'm sorry about these macros, but copy.s is unsurprisingly sensitive to
 139  * additional call instructions.
 140  */
 141 #define SMAP_DISABLE_COUNT      16
 142 #define SMAP_ENABLE_COUNT       26
 143 
 144 #define SMAP_DISABLE_INSTR(ITER)                \
 145         .globl  _smap_disable_patch_/**/ITER;   \
 146         _smap_disable_patch_/**/ITER/**/:;      \
 147         nop; nop; nop;
 148 
 149 #define SMAP_ENABLE_INSTR(ITER)                 \
 150         .globl  _smap_enable_patch_/**/ITER;    \
 151         _smap_enable_patch_/**/ITER/**/:;       \
 152         nop; nop; nop;
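
/*
 * Each of the patch points above reserves three bytes of nops.  On hardware
 * with SMAP, the boot code is expected to rewrite those three bytes with the
 * three-byte stac (at the disable points) or clac (at the enable points)
 * instruction; stac sets PS_ACHK in rflags so the kernel may touch user
 * pages, and clac clears it again.  The patching logic itself lives in the
 * startup code, not in this file, so treat this note as a reading aid rather
 * than a description of that code.
 */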
 153 
 154         .globl  kernelbase
 155         .globl  postbootkernelbase
 156 
 157         ENTRY(kcopy)
 158         pushq   %rbp
 159         movq    %rsp, %rbp
 160 #ifdef DEBUG
 161         cmpq    postbootkernelbase(%rip), %rdi          /* %rdi = from */
 162         jb      0f
 163         cmpq    postbootkernelbase(%rip), %rsi          /* %rsi = to */
 164         jnb     1f
 165 0:      leaq    .kcopy_panic_msg(%rip), %rdi
 166         xorl    %eax, %eax
 167         call    panic
 168 1:
 169 #endif
 170         /*
 171          * pass lofault value as 4th argument to do_copy_fault
 172          */
 173         leaq    _kcopy_copyerr(%rip), %rcx
 174         movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
 175 
 176 do_copy_fault:
 177         movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
 178         movq    %rcx, T_LOFAULT(%r9)    /* new lofault */
 179         call    bcopy_altentry
 180         xorl    %eax, %eax              /* return 0 (success) */
 181         SMAP_ENABLE_INSTR(0)
 182 
 183         /*
 184          * A fault during do_copy_fault is indicated through an errno value
 185          * in %rax and we iretq from the trap handler to here.
 186          */
 187 _kcopy_copyerr:
 188         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
 189         leave
 190         ret
 191         SET_SIZE(kcopy)
 192 
 193 #undef  ARG_FROM
 194 #undef  ARG_TO
 195 #undef  ARG_COUNT
 196 
 197 #define COPY_LOOP_INIT(src, dst, cnt)   \
 198         addq    cnt, src;                       \
 199         addq    cnt, dst;                       \
 200         shrq    $3, cnt;                        \
 201         neg     cnt
 202 
 203         /* Copy 16 bytes per loop.  Uses %rax and %r8 */
 204 #define COPY_LOOP_BODY(src, dst, cnt)   \
 205         prefetchnta     0x100(src, cnt, 8);     \
 206         movq    (src, cnt, 8), %rax;            \
 207         movq    0x8(src, cnt, 8), %r8;          \
 208         movnti  %rax, (dst, cnt, 8);            \
 209         movnti  %r8, 0x8(dst, cnt, 8);          \
 210         addq    $2, cnt
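
/*
 * A worked example of the two macros above (illustrative only): for a
 * 128-byte copy, COPY_LOOP_INIT leaves src and dst pointing just past the
 * end of their buffers and turns cnt into -16 (the negated quadword count).
 * Each COPY_LOOP_BODY pass then copies the two quadwords at offsets cnt*8
 * and cnt*8 + 8 from the end using movnti and adds 2 to cnt, so the jnz
 * that follows the macro falls through exactly when cnt reaches zero.
 */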
 211 
 212         ENTRY(kcopy_nta)
 213         pushq   %rbp
 214         movq    %rsp, %rbp
 215 #ifdef DEBUG
 216         cmpq    postbootkernelbase(%rip), %rdi          /* %rdi = from */
 217         jb      0f
 218         cmpq    postbootkernelbase(%rip), %rsi          /* %rsi = to */
 219         jnb     1f
 220 0:      leaq    .kcopy_panic_msg(%rip), %rdi
 221         xorl    %eax, %eax
 222         call    panic
 223 1:
 224 #endif
 225 
 226         movq    %gs:CPU_THREAD, %r9
 227         cmpq    $0, %rcx                /* No non-temporal access? */
 228         /*
 229          * pass lofault value as 4th argument to do_copy_fault
 230          */
 231         leaq    _kcopy_nta_copyerr(%rip), %rcx  /* doesn't set rflags */
 232         jnz     do_copy_fault           /* use regular access */
 233         /*
 234          * Make sure cnt is >= KCOPY_MIN_SIZE
 235          */
 236         cmpq    $KCOPY_MIN_SIZE, %rdx
 237         jb      do_copy_fault
 238 
 239         /*
 240          * Make sure src and dst are NTA_ALIGN_SIZE aligned,
 241          * count is COUNT_ALIGN_SIZE aligned.
 242          */
 243         movq    %rdi, %r10
 244         orq     %rsi, %r10
 245         andq    $NTA_ALIGN_MASK, %r10
 246         orq     %rdx, %r10
 247         andq    $COUNT_ALIGN_MASK, %r10
 248         jnz     do_copy_fault
 249 
 250         ALTENTRY(do_copy_fault_nta)
 251         movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
 252         movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
 253         movq    %rcx, T_LOFAULT(%r9)    /* new lofault */
 254 
 255         /*
 256          * COPY_LOOP_BODY uses %rax and %r8
 257          */
 258         COPY_LOOP_INIT(%rdi, %rsi, %rdx)
 259 2:      COPY_LOOP_BODY(%rdi, %rsi, %rdx)
 260         jnz     2b
 261 
 262         mfence
 263         xorl    %eax, %eax              /* return 0 (success) */
 264         SMAP_ENABLE_INSTR(1)
 265 
 266 _kcopy_nta_copyerr:
 267         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
 268         leave
 269         ret
 270         SET_SIZE(do_copy_fault_nta)
 271         SET_SIZE(kcopy_nta)
 272 
 273         ENTRY(bcopy)
 274 #ifdef DEBUG
 275         orq     %rdx, %rdx              /* %rdx = count */
 276         jz      1f
 277         cmpq    postbootkernelbase(%rip), %rdi          /* %rdi = from */
 278         jb      0f
 279         cmpq    postbootkernelbase(%rip), %rsi          /* %rsi = to */
 280         jnb     1f
 281 0:      leaq    .bcopy_panic_msg(%rip), %rdi
 282         jmp     call_panic              /* setup stack and call panic */
 283 1:
 284 #endif
        /*
         * bcopy_altentry() is called from kcopy, i.e., do_copy_fault.
         * kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy
         * ever uses these registers, they must be saved and restored.
         */
 290         ALTENTRY(bcopy_altentry)
 291 do_copy:
 292 #define L(s) .bcopy/**/s
 293         cmpq    $0x50, %rdx             /* 80 */
 294         jae     bcopy_ck_size
 295 
        /*
         * Performance data shows many callers copy small buffers. So for
         * best perf at these sizes, unrolled code is used. Store data without
         * worrying about alignment.
         */
 301         leaq    L(fwdPxQx)(%rip), %r10
 302         addq    %rdx, %rdi
 303         addq    %rdx, %rsi
 304         movslq  (%r10,%rdx,4), %rcx
 305         leaq    (%rcx,%r10,1), %r10
 306         INDIRECT_JMP_REG(r10)
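
        /*
         * Dispatch example (illustrative): for a 13-byte copy, %rdi and
         * %rsi now point just past the end of their buffers, the table
         * entry at index 13 holds the offset of L(P5Q1), and the indirect
         * jump lands there.  L(P5Q1) copies the quadword at offset -0xd
         * and falls through into L(P5Q0), which copies the trailing dword
         * and byte, for 8 + 4 + 1 = 13 bytes in total.
         */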
 307 
 308         .p2align 4
 309 L(fwdPxQx):
 310         .int       L(P0Q0)-L(fwdPxQx)   /* 0 */
 311         .int       L(P1Q0)-L(fwdPxQx)
 312         .int       L(P2Q0)-L(fwdPxQx)
 313         .int       L(P3Q0)-L(fwdPxQx)
 314         .int       L(P4Q0)-L(fwdPxQx)
 315         .int       L(P5Q0)-L(fwdPxQx)
 316         .int       L(P6Q0)-L(fwdPxQx)
 317         .int       L(P7Q0)-L(fwdPxQx)
 318 
 319         .int       L(P0Q1)-L(fwdPxQx)   /* 8 */
 320         .int       L(P1Q1)-L(fwdPxQx)
 321         .int       L(P2Q1)-L(fwdPxQx)
 322         .int       L(P3Q1)-L(fwdPxQx)
 323         .int       L(P4Q1)-L(fwdPxQx)
 324         .int       L(P5Q1)-L(fwdPxQx)
 325         .int       L(P6Q1)-L(fwdPxQx)
 326         .int       L(P7Q1)-L(fwdPxQx)
 327 
 328         .int       L(P0Q2)-L(fwdPxQx)   /* 16 */
 329         .int       L(P1Q2)-L(fwdPxQx)
 330         .int       L(P2Q2)-L(fwdPxQx)
 331         .int       L(P3Q2)-L(fwdPxQx)
 332         .int       L(P4Q2)-L(fwdPxQx)
 333         .int       L(P5Q2)-L(fwdPxQx)
 334         .int       L(P6Q2)-L(fwdPxQx)
 335         .int       L(P7Q2)-L(fwdPxQx)
 336 
 337         .int       L(P0Q3)-L(fwdPxQx)   /* 24 */
 338         .int       L(P1Q3)-L(fwdPxQx)
 339         .int       L(P2Q3)-L(fwdPxQx)
 340         .int       L(P3Q3)-L(fwdPxQx)
 341         .int       L(P4Q3)-L(fwdPxQx)
 342         .int       L(P5Q3)-L(fwdPxQx)
 343         .int       L(P6Q3)-L(fwdPxQx)
 344         .int       L(P7Q3)-L(fwdPxQx)
 345 
 346         .int       L(P0Q4)-L(fwdPxQx)   /* 32 */
 347         .int       L(P1Q4)-L(fwdPxQx)
 348         .int       L(P2Q4)-L(fwdPxQx)
 349         .int       L(P3Q4)-L(fwdPxQx)
 350         .int       L(P4Q4)-L(fwdPxQx)
 351         .int       L(P5Q4)-L(fwdPxQx)
 352         .int       L(P6Q4)-L(fwdPxQx)
 353         .int       L(P7Q4)-L(fwdPxQx)
 354 
 355         .int       L(P0Q5)-L(fwdPxQx)   /* 40 */
 356         .int       L(P1Q5)-L(fwdPxQx)
 357         .int       L(P2Q5)-L(fwdPxQx)
 358         .int       L(P3Q5)-L(fwdPxQx)
 359         .int       L(P4Q5)-L(fwdPxQx)
 360         .int       L(P5Q5)-L(fwdPxQx)
 361         .int       L(P6Q5)-L(fwdPxQx)
 362         .int       L(P7Q5)-L(fwdPxQx)
 363 
 364         .int       L(P0Q6)-L(fwdPxQx)   /* 48 */
 365         .int       L(P1Q6)-L(fwdPxQx)
 366         .int       L(P2Q6)-L(fwdPxQx)
 367         .int       L(P3Q6)-L(fwdPxQx)
 368         .int       L(P4Q6)-L(fwdPxQx)
 369         .int       L(P5Q6)-L(fwdPxQx)
 370         .int       L(P6Q6)-L(fwdPxQx)
 371         .int       L(P7Q6)-L(fwdPxQx)
 372 
 373         .int       L(P0Q7)-L(fwdPxQx)   /* 56 */
 374         .int       L(P1Q7)-L(fwdPxQx)
 375         .int       L(P2Q7)-L(fwdPxQx)
 376         .int       L(P3Q7)-L(fwdPxQx)
 377         .int       L(P4Q7)-L(fwdPxQx)
 378         .int       L(P5Q7)-L(fwdPxQx)
 379         .int       L(P6Q7)-L(fwdPxQx)
 380         .int       L(P7Q7)-L(fwdPxQx)
 381 
 382         .int       L(P0Q8)-L(fwdPxQx)   /* 64 */
 383         .int       L(P1Q8)-L(fwdPxQx)
 384         .int       L(P2Q8)-L(fwdPxQx)
 385         .int       L(P3Q8)-L(fwdPxQx)
 386         .int       L(P4Q8)-L(fwdPxQx)
 387         .int       L(P5Q8)-L(fwdPxQx)
 388         .int       L(P6Q8)-L(fwdPxQx)
 389         .int       L(P7Q8)-L(fwdPxQx)
 390 
 391         .int       L(P0Q9)-L(fwdPxQx)   /* 72 */
 392         .int       L(P1Q9)-L(fwdPxQx)
 393         .int       L(P2Q9)-L(fwdPxQx)
 394         .int       L(P3Q9)-L(fwdPxQx)
 395         .int       L(P4Q9)-L(fwdPxQx)
 396         .int       L(P5Q9)-L(fwdPxQx)
 397         .int       L(P6Q9)-L(fwdPxQx)
 398         .int       L(P7Q9)-L(fwdPxQx)   /* 79 */
 399 
 400         .p2align 4
 401 L(P0Q9):
 402         mov    -0x48(%rdi), %rcx
 403         mov    %rcx, -0x48(%rsi)
 404 L(P0Q8):
 405         mov    -0x40(%rdi), %r10
 406         mov    %r10, -0x40(%rsi)
 407 L(P0Q7):
 408         mov    -0x38(%rdi), %r8
 409         mov    %r8, -0x38(%rsi)
 410 L(P0Q6):
 411         mov    -0x30(%rdi), %rcx
 412         mov    %rcx, -0x30(%rsi)
 413 L(P0Q5):
 414         mov    -0x28(%rdi), %r10
 415         mov    %r10, -0x28(%rsi)
 416 L(P0Q4):
 417         mov    -0x20(%rdi), %r8
 418         mov    %r8, -0x20(%rsi)
 419 L(P0Q3):
 420         mov    -0x18(%rdi), %rcx
 421         mov    %rcx, -0x18(%rsi)
 422 L(P0Q2):
 423         mov    -0x10(%rdi), %r10
 424         mov    %r10, -0x10(%rsi)
 425 L(P0Q1):
 426         mov    -0x8(%rdi), %r8
 427         mov    %r8, -0x8(%rsi)
 428 L(P0Q0):
 429         ret
 430 
 431         .p2align 4
 432 L(P1Q9):
 433         mov    -0x49(%rdi), %r8
 434         mov    %r8, -0x49(%rsi)
 435 L(P1Q8):
 436         mov    -0x41(%rdi), %rcx
 437         mov    %rcx, -0x41(%rsi)
 438 L(P1Q7):
 439         mov    -0x39(%rdi), %r10
 440         mov    %r10, -0x39(%rsi)
 441 L(P1Q6):
 442         mov    -0x31(%rdi), %r8
 443         mov    %r8, -0x31(%rsi)
 444 L(P1Q5):
 445         mov    -0x29(%rdi), %rcx
 446         mov    %rcx, -0x29(%rsi)
 447 L(P1Q4):
 448         mov    -0x21(%rdi), %r10
 449         mov    %r10, -0x21(%rsi)
 450 L(P1Q3):
 451         mov    -0x19(%rdi), %r8
 452         mov    %r8, -0x19(%rsi)
 453 L(P1Q2):
 454         mov    -0x11(%rdi), %rcx
 455         mov    %rcx, -0x11(%rsi)
 456 L(P1Q1):
 457         mov    -0x9(%rdi), %r10
 458         mov    %r10, -0x9(%rsi)
 459 L(P1Q0):
 460         movzbq -0x1(%rdi), %r8
 461         mov    %r8b, -0x1(%rsi)
 462         ret
 463 
 464         .p2align 4
 465 L(P2Q9):
 466         mov    -0x4a(%rdi), %r8
 467         mov    %r8, -0x4a(%rsi)
 468 L(P2Q8):
 469         mov    -0x42(%rdi), %rcx
 470         mov    %rcx, -0x42(%rsi)
 471 L(P2Q7):
 472         mov    -0x3a(%rdi), %r10
 473         mov    %r10, -0x3a(%rsi)
 474 L(P2Q6):
 475         mov    -0x32(%rdi), %r8
 476         mov    %r8, -0x32(%rsi)
 477 L(P2Q5):
 478         mov    -0x2a(%rdi), %rcx
 479         mov    %rcx, -0x2a(%rsi)
 480 L(P2Q4):
 481         mov    -0x22(%rdi), %r10
 482         mov    %r10, -0x22(%rsi)
 483 L(P2Q3):
 484         mov    -0x1a(%rdi), %r8
 485         mov    %r8, -0x1a(%rsi)
 486 L(P2Q2):
 487         mov    -0x12(%rdi), %rcx
 488         mov    %rcx, -0x12(%rsi)
 489 L(P2Q1):
 490         mov    -0xa(%rdi), %r10
 491         mov    %r10, -0xa(%rsi)
 492 L(P2Q0):
 493         movzwq -0x2(%rdi), %r8
 494         mov    %r8w, -0x2(%rsi)
 495         ret
 496 
 497         .p2align 4
 498 L(P3Q9):
 499         mov    -0x4b(%rdi), %r8
 500         mov    %r8, -0x4b(%rsi)
 501 L(P3Q8):
 502         mov    -0x43(%rdi), %rcx
 503         mov    %rcx, -0x43(%rsi)
 504 L(P3Q7):
 505         mov    -0x3b(%rdi), %r10
 506         mov    %r10, -0x3b(%rsi)
 507 L(P3Q6):
 508         mov    -0x33(%rdi), %r8
 509         mov    %r8, -0x33(%rsi)
 510 L(P3Q5):
 511         mov    -0x2b(%rdi), %rcx
 512         mov    %rcx, -0x2b(%rsi)
 513 L(P3Q4):
 514         mov    -0x23(%rdi), %r10
 515         mov    %r10, -0x23(%rsi)
 516 L(P3Q3):
 517         mov    -0x1b(%rdi), %r8
 518         mov    %r8, -0x1b(%rsi)
 519 L(P3Q2):
 520         mov    -0x13(%rdi), %rcx
 521         mov    %rcx, -0x13(%rsi)
 522 L(P3Q1):
 523         mov    -0xb(%rdi), %r10
 524         mov    %r10, -0xb(%rsi)
 525         /*
 526          * These trailing loads/stores have to do all their loads 1st,
 527          * then do the stores.
 528          */
 529 L(P3Q0):
 530         movzwq -0x3(%rdi), %r8
 531         movzbq -0x1(%rdi), %r10
 532         mov    %r8w, -0x3(%rsi)
 533         mov    %r10b, -0x1(%rsi)
 534         ret
 535 
 536         .p2align 4
 537 L(P4Q9):
 538         mov    -0x4c(%rdi), %r8
 539         mov    %r8, -0x4c(%rsi)
 540 L(P4Q8):
 541         mov    -0x44(%rdi), %rcx
 542         mov    %rcx, -0x44(%rsi)
 543 L(P4Q7):
 544         mov    -0x3c(%rdi), %r10
 545         mov    %r10, -0x3c(%rsi)
 546 L(P4Q6):
 547         mov    -0x34(%rdi), %r8
 548         mov    %r8, -0x34(%rsi)
 549 L(P4Q5):
 550         mov    -0x2c(%rdi), %rcx
 551         mov    %rcx, -0x2c(%rsi)
 552 L(P4Q4):
 553         mov    -0x24(%rdi), %r10
 554         mov    %r10, -0x24(%rsi)
 555 L(P4Q3):
 556         mov    -0x1c(%rdi), %r8
 557         mov    %r8, -0x1c(%rsi)
 558 L(P4Q2):
 559         mov    -0x14(%rdi), %rcx
 560         mov    %rcx, -0x14(%rsi)
 561 L(P4Q1):
 562         mov    -0xc(%rdi), %r10
 563         mov    %r10, -0xc(%rsi)
 564 L(P4Q0):
 565         mov    -0x4(%rdi), %r8d
 566         mov    %r8d, -0x4(%rsi)
 567         ret
 568 
 569         .p2align 4
 570 L(P5Q9):
 571         mov    -0x4d(%rdi), %r8
 572         mov    %r8, -0x4d(%rsi)
 573 L(P5Q8):
 574         mov    -0x45(%rdi), %rcx
 575         mov    %rcx, -0x45(%rsi)
 576 L(P5Q7):
 577         mov    -0x3d(%rdi), %r10
 578         mov    %r10, -0x3d(%rsi)
 579 L(P5Q6):
 580         mov    -0x35(%rdi), %r8
 581         mov    %r8, -0x35(%rsi)
 582 L(P5Q5):
 583         mov    -0x2d(%rdi), %rcx
 584         mov    %rcx, -0x2d(%rsi)
 585 L(P5Q4):
 586         mov    -0x25(%rdi), %r10
 587         mov    %r10, -0x25(%rsi)
 588 L(P5Q3):
 589         mov    -0x1d(%rdi), %r8
 590         mov    %r8, -0x1d(%rsi)
 591 L(P5Q2):
 592         mov    -0x15(%rdi), %rcx
 593         mov    %rcx, -0x15(%rsi)
 594 L(P5Q1):
 595         mov    -0xd(%rdi), %r10
 596         mov    %r10, -0xd(%rsi)
 597 L(P5Q0):
 598         mov    -0x5(%rdi), %r8d
 599         movzbq -0x1(%rdi), %r10
 600         mov    %r8d, -0x5(%rsi)
 601         mov    %r10b, -0x1(%rsi)
 602         ret
 603 
 604         .p2align 4
 605 L(P6Q9):
 606         mov    -0x4e(%rdi), %r8
 607         mov    %r8, -0x4e(%rsi)
 608 L(P6Q8):
 609         mov    -0x46(%rdi), %rcx
 610         mov    %rcx, -0x46(%rsi)
 611 L(P6Q7):
 612         mov    -0x3e(%rdi), %r10
 613         mov    %r10, -0x3e(%rsi)
 614 L(P6Q6):
 615         mov    -0x36(%rdi), %r8
 616         mov    %r8, -0x36(%rsi)
 617 L(P6Q5):
 618         mov    -0x2e(%rdi), %rcx
 619         mov    %rcx, -0x2e(%rsi)
 620 L(P6Q4):
 621         mov    -0x26(%rdi), %r10
 622         mov    %r10, -0x26(%rsi)
 623 L(P6Q3):
 624         mov    -0x1e(%rdi), %r8
 625         mov    %r8, -0x1e(%rsi)
 626 L(P6Q2):
 627         mov    -0x16(%rdi), %rcx
 628         mov    %rcx, -0x16(%rsi)
 629 L(P6Q1):
 630         mov    -0xe(%rdi), %r10
 631         mov    %r10, -0xe(%rsi)
 632 L(P6Q0):
 633         mov    -0x6(%rdi), %r8d
 634         movzwq -0x2(%rdi), %r10
 635         mov    %r8d, -0x6(%rsi)
 636         mov    %r10w, -0x2(%rsi)
 637         ret
 638 
 639         .p2align 4
 640 L(P7Q9):
 641         mov    -0x4f(%rdi), %r8
 642         mov    %r8, -0x4f(%rsi)
 643 L(P7Q8):
 644         mov    -0x47(%rdi), %rcx
 645         mov    %rcx, -0x47(%rsi)
 646 L(P7Q7):
 647         mov    -0x3f(%rdi), %r10
 648         mov    %r10, -0x3f(%rsi)
 649 L(P7Q6):
 650         mov    -0x37(%rdi), %r8
 651         mov    %r8, -0x37(%rsi)
 652 L(P7Q5):
 653         mov    -0x2f(%rdi), %rcx
 654         mov    %rcx, -0x2f(%rsi)
 655 L(P7Q4):
 656         mov    -0x27(%rdi), %r10
 657         mov    %r10, -0x27(%rsi)
 658 L(P7Q3):
 659         mov    -0x1f(%rdi), %r8
 660         mov    %r8, -0x1f(%rsi)
 661 L(P7Q2):
 662         mov    -0x17(%rdi), %rcx
 663         mov    %rcx, -0x17(%rsi)
 664 L(P7Q1):
 665         mov    -0xf(%rdi), %r10
 666         mov    %r10, -0xf(%rsi)
 667 L(P7Q0):
 668         mov    -0x7(%rdi), %r8d
 669         movzwq -0x3(%rdi), %r10
 670         movzbq -0x1(%rdi), %rcx
 671         mov    %r8d, -0x7(%rsi)
 672         mov    %r10w, -0x3(%rsi)
 673         mov    %cl, -0x1(%rsi)
 674         ret
 675 
        /*
         * For large sizes rep smovq is fastest.
         * Transition point determined experimentally as measured on
         * Intel Xeon processors (incl. Nehalem and previous generations) and
         * AMD Opteron. The transition value is patched at boot time to avoid
         * the cost of an extra memory reference.
         */
 683         .globl bcopy_patch_start
 684 bcopy_patch_start:
 685         cmpq    $BCOPY_NHM_REP, %rdx
 686         .globl bcopy_patch_end
 687 bcopy_patch_end:
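
        /*
         * The compare bracketed by bcopy_patch_start/bcopy_patch_end holds
         * the Nehalem-tuned threshold.  The boot-time patching referred to
         * above presumably copies this instruction over the default compare
         * at bcopy_ck_size on CPUs where the higher threshold wins; the
         * patching is done by startup code elsewhere, so treat this note as
         * an assumption about the mechanism rather than a statement of it.
         */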
 688 
 689         .p2align 4
 690         ALTENTRY(bcopy_ck_size)
 691 
 692         cmpq    $BCOPY_DFLT_REP, %rdx
 693         jae     L(use_rep)
 694 
        /*
         * Align to an 8-byte boundary. Avoids penalties from unaligned stores
         * as well as from stores spanning cachelines.
         */
 699         test    $0x7, %rsi
 700         jz      L(aligned_loop)
 701         test    $0x1, %rsi
 702         jz      2f
 703         movzbq  (%rdi), %r8
 704         dec     %rdx
 705         inc     %rdi
 706         mov     %r8b, (%rsi)
 707         inc     %rsi
 708 2:
 709         test    $0x2, %rsi
 710         jz      4f
 711         movzwq  (%rdi), %r8
 712         sub     $0x2, %rdx
 713         add     $0x2, %rdi
 714         mov     %r8w, (%rsi)
 715         add     $0x2, %rsi
 716 4:
 717         test    $0x4, %rsi
 718         jz      L(aligned_loop)
 719         mov     (%rdi), %r8d
 720         sub     $0x4, %rdx
 721         add     $0x4, %rdi
 722         mov     %r8d, (%rsi)
 723         add     $0x4, %rsi
 724 
        /*
         * Copy 64 bytes per loop
         */
 728         .p2align 4
 729 L(aligned_loop):
 730         mov     (%rdi), %r8
 731         mov     0x8(%rdi), %r10
 732         lea     -0x40(%rdx), %rdx
 733         mov     %r8, (%rsi)
 734         mov     %r10, 0x8(%rsi)
 735         mov     0x10(%rdi), %rcx
 736         mov     0x18(%rdi), %r8
 737         mov     %rcx, 0x10(%rsi)
 738         mov     %r8, 0x18(%rsi)
 739 
 740         cmp     $0x40, %rdx
 741         mov     0x20(%rdi), %r10
 742         mov     0x28(%rdi), %rcx
 743         mov     %r10, 0x20(%rsi)
 744         mov     %rcx, 0x28(%rsi)
 745         mov     0x30(%rdi), %r8
 746         mov     0x38(%rdi), %r10
 747         lea     0x40(%rdi), %rdi
 748         mov     %r8, 0x30(%rsi)
 749         mov     %r10, 0x38(%rsi)
 750         lea     0x40(%rsi), %rsi
 751         jae     L(aligned_loop)
 752 
 753         /*
 754          * Copy remaining bytes (0-63)
 755          */
 756 L(do_remainder):
 757         leaq    L(fwdPxQx)(%rip), %r10
 758         addq    %rdx, %rdi
 759         addq    %rdx, %rsi
 760         movslq  (%r10,%rdx,4), %rcx
 761         leaq    (%rcx,%r10,1), %r10
 762         INDIRECT_JMP_REG(r10)
 763 
        /*
         * Use rep smovq. Copy any remainder via the unrolled code
         */
 767         .p2align 4
 768 L(use_rep):
 769         xchgq   %rdi, %rsi              /* %rsi = source, %rdi = destination */
 770         movq    %rdx, %rcx              /* %rcx = count */
 771         shrq    $3, %rcx                /* 8-byte word count */
 772         rep
 773           smovq
 774 
 775         xchgq   %rsi, %rdi              /* %rdi = src, %rsi = destination */
 776         andq    $7, %rdx                /* remainder */
 777         jnz     L(do_remainder)
 778         ret
 779 #undef  L
 780         SET_SIZE(bcopy_ck_size)
 781 
 782 #ifdef DEBUG
        /*
         * Set up a frame on the run-time stack. The end of the input argument
         * area must be aligned on a 16-byte boundary. The stack pointer,
         * %rsp, always points to the end of the latest allocated stack frame.
         * panic(const char *format, ...) is a varargs function, so %al must
         * hold (an upper bound on) the number of vector registers used to
         * pass floating point arguments; it is zeroed below because no
         * floating point arguments are passed to panic here.
         */
 792 call_panic:
 793         pushq   %rbp                    /* align stack properly */
 794         movq    %rsp, %rbp
 795         xorl    %eax, %eax              /* no variable arguments */
 796         call    panic                   /* %rdi = format string */
 797 #endif
 798         SET_SIZE(bcopy_altentry)
 799         SET_SIZE(bcopy)
 800 
 801 
 802 /*
 803  * Zero a block of storage, returning an error code if we
 804  * take a kernel pagefault which cannot be resolved.
 805  * Returns errno value on pagefault error, 0 if all ok
 806  */
 807 
 808         ENTRY(kzero)
 809 #ifdef DEBUG
 810         cmpq    postbootkernelbase(%rip), %rdi  /* %rdi = addr */
 811         jnb     0f
 812         leaq    .kzero_panic_msg(%rip), %rdi
 813         jmp     call_panic              /* setup stack and call panic */
 814 0:
 815 #endif
 816         /*
 817          * pass lofault value as 3rd argument for fault return
 818          */
 819         leaq    _kzeroerr(%rip), %rdx
 820 
 821         movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
 822         movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
 823         movq    %rdx, T_LOFAULT(%r9)    /* new lofault */
 824         call    bzero_altentry
 825         xorl    %eax, %eax
 826         movq    %r11, T_LOFAULT(%r9)    /* restore the original lofault */
 827         ret
 828         /*
 829          * A fault during bzero is indicated through an errno value
 830          * in %rax when we iretq to here.
 831          */
 832 _kzeroerr:
 833         addq    $8, %rsp                /* pop bzero_altentry call ret addr */
 834         movq    %r11, T_LOFAULT(%r9)    /* restore the original lofault */
 835         ret
 836         SET_SIZE(kzero)
 837 
 838 /*
 839  * Zero a block of storage.
 840  */
 841 
 842         ENTRY(bzero)
 843 #ifdef DEBUG
 844         cmpq    postbootkernelbase(%rip), %rdi  /* %rdi = addr */
 845         jnb     0f
 846         leaq    .bzero_panic_msg(%rip), %rdi
 847         jmp     call_panic              /* setup stack and call panic */
 848 0:
 849 #endif
 850         ALTENTRY(bzero_altentry)
 851 do_zero:
 852 #define L(s) .bzero/**/s
 853         xorl    %eax, %eax
 854 
 855         cmpq    $0x50, %rsi             /* 80 */
 856         jae     L(ck_align)
 857 
        /*
         * Performance data shows many callers are zeroing small buffers. So
         * for best perf at these sizes, unrolled code is used. Store zeros
         * without worrying about alignment.
         */
 863         leaq    L(setPxQx)(%rip), %r10
 864         addq    %rsi, %rdi
 865         movslq  (%r10,%rsi,4), %rcx
 866         leaq    (%rcx,%r10,1), %r10
 867         INDIRECT_JMP_REG(r10)
 868 
 869         .p2align 4
 870 L(setPxQx):
 871         .int       L(P0Q0)-L(setPxQx)   /* 0 */
 872         .int       L(P1Q0)-L(setPxQx)
 873         .int       L(P2Q0)-L(setPxQx)
 874         .int       L(P3Q0)-L(setPxQx)
 875         .int       L(P4Q0)-L(setPxQx)
 876         .int       L(P5Q0)-L(setPxQx)
 877         .int       L(P6Q0)-L(setPxQx)
 878         .int       L(P7Q0)-L(setPxQx)
 879 
 880         .int       L(P0Q1)-L(setPxQx)   /* 8 */
 881         .int       L(P1Q1)-L(setPxQx)
 882         .int       L(P2Q1)-L(setPxQx)
 883         .int       L(P3Q1)-L(setPxQx)
 884         .int       L(P4Q1)-L(setPxQx)
 885         .int       L(P5Q1)-L(setPxQx)
 886         .int       L(P6Q1)-L(setPxQx)
 887         .int       L(P7Q1)-L(setPxQx)
 888 
 889         .int       L(P0Q2)-L(setPxQx)   /* 16 */
 890         .int       L(P1Q2)-L(setPxQx)
 891         .int       L(P2Q2)-L(setPxQx)
 892         .int       L(P3Q2)-L(setPxQx)
 893         .int       L(P4Q2)-L(setPxQx)
 894         .int       L(P5Q2)-L(setPxQx)
 895         .int       L(P6Q2)-L(setPxQx)
 896         .int       L(P7Q2)-L(setPxQx)
 897 
 898         .int       L(P0Q3)-L(setPxQx)   /* 24 */
 899         .int       L(P1Q3)-L(setPxQx)
 900         .int       L(P2Q3)-L(setPxQx)
 901         .int       L(P3Q3)-L(setPxQx)
 902         .int       L(P4Q3)-L(setPxQx)
 903         .int       L(P5Q3)-L(setPxQx)
 904         .int       L(P6Q3)-L(setPxQx)
 905         .int       L(P7Q3)-L(setPxQx)
 906 
 907         .int       L(P0Q4)-L(setPxQx)   /* 32 */
 908         .int       L(P1Q4)-L(setPxQx)
 909         .int       L(P2Q4)-L(setPxQx)
 910         .int       L(P3Q4)-L(setPxQx)
 911         .int       L(P4Q4)-L(setPxQx)
 912         .int       L(P5Q4)-L(setPxQx)
 913         .int       L(P6Q4)-L(setPxQx)
 914         .int       L(P7Q4)-L(setPxQx)
 915 
 916         .int       L(P0Q5)-L(setPxQx)   /* 40 */
 917         .int       L(P1Q5)-L(setPxQx)
 918         .int       L(P2Q5)-L(setPxQx)
 919         .int       L(P3Q5)-L(setPxQx)
 920         .int       L(P4Q5)-L(setPxQx)
 921         .int       L(P5Q5)-L(setPxQx)
 922         .int       L(P6Q5)-L(setPxQx)
 923         .int       L(P7Q5)-L(setPxQx)
 924 
 925         .int       L(P0Q6)-L(setPxQx)   /* 48 */
 926         .int       L(P1Q6)-L(setPxQx)
 927         .int       L(P2Q6)-L(setPxQx)
 928         .int       L(P3Q6)-L(setPxQx)
 929         .int       L(P4Q6)-L(setPxQx)
 930         .int       L(P5Q6)-L(setPxQx)
 931         .int       L(P6Q6)-L(setPxQx)
 932         .int       L(P7Q6)-L(setPxQx)
 933 
 934         .int       L(P0Q7)-L(setPxQx)   /* 56 */
 935         .int       L(P1Q7)-L(setPxQx)
 936         .int       L(P2Q7)-L(setPxQx)
 937         .int       L(P3Q7)-L(setPxQx)
 938         .int       L(P4Q7)-L(setPxQx)
 939         .int       L(P5Q7)-L(setPxQx)
 940         .int       L(P6Q7)-L(setPxQx)
 941         .int       L(P7Q7)-L(setPxQx)
 942 
 943         .int       L(P0Q8)-L(setPxQx)   /* 64 */
 944         .int       L(P1Q8)-L(setPxQx)
 945         .int       L(P2Q8)-L(setPxQx)
 946         .int       L(P3Q8)-L(setPxQx)
 947         .int       L(P4Q8)-L(setPxQx)
 948         .int       L(P5Q8)-L(setPxQx)
 949         .int       L(P6Q8)-L(setPxQx)
 950         .int       L(P7Q8)-L(setPxQx)
 951 
 952         .int       L(P0Q9)-L(setPxQx)   /* 72 */
 953         .int       L(P1Q9)-L(setPxQx)
 954         .int       L(P2Q9)-L(setPxQx)
 955         .int       L(P3Q9)-L(setPxQx)
 956         .int       L(P4Q9)-L(setPxQx)
 957         .int       L(P5Q9)-L(setPxQx)
 958         .int       L(P6Q9)-L(setPxQx)
 959         .int       L(P7Q9)-L(setPxQx)   /* 79 */
 960 
 961         .p2align 4
 962 L(P0Q9): mov    %rax, -0x48(%rdi)
 963 L(P0Q8): mov    %rax, -0x40(%rdi)
 964 L(P0Q7): mov    %rax, -0x38(%rdi)
 965 L(P0Q6): mov    %rax, -0x30(%rdi)
 966 L(P0Q5): mov    %rax, -0x28(%rdi)
 967 L(P0Q4): mov    %rax, -0x20(%rdi)
 968 L(P0Q3): mov    %rax, -0x18(%rdi)
 969 L(P0Q2): mov    %rax, -0x10(%rdi)
 970 L(P0Q1): mov    %rax, -0x8(%rdi)
 971 L(P0Q0):
 972          ret
 973 
 974         .p2align 4
 975 L(P1Q9): mov    %rax, -0x49(%rdi)
 976 L(P1Q8): mov    %rax, -0x41(%rdi)
 977 L(P1Q7): mov    %rax, -0x39(%rdi)
 978 L(P1Q6): mov    %rax, -0x31(%rdi)
 979 L(P1Q5): mov    %rax, -0x29(%rdi)
 980 L(P1Q4): mov    %rax, -0x21(%rdi)
 981 L(P1Q3): mov    %rax, -0x19(%rdi)
 982 L(P1Q2): mov    %rax, -0x11(%rdi)
 983 L(P1Q1): mov    %rax, -0x9(%rdi)
 984 L(P1Q0): mov    %al, -0x1(%rdi)
 985          ret
 986 
 987         .p2align 4
 988 L(P2Q9): mov    %rax, -0x4a(%rdi)
 989 L(P2Q8): mov    %rax, -0x42(%rdi)
 990 L(P2Q7): mov    %rax, -0x3a(%rdi)
 991 L(P2Q6): mov    %rax, -0x32(%rdi)
 992 L(P2Q5): mov    %rax, -0x2a(%rdi)
 993 L(P2Q4): mov    %rax, -0x22(%rdi)
 994 L(P2Q3): mov    %rax, -0x1a(%rdi)
 995 L(P2Q2): mov    %rax, -0x12(%rdi)
 996 L(P2Q1): mov    %rax, -0xa(%rdi)
 997 L(P2Q0): mov    %ax, -0x2(%rdi)
 998          ret
 999 
1000         .p2align 4
1001 L(P3Q9): mov    %rax, -0x4b(%rdi)
1002 L(P3Q8): mov    %rax, -0x43(%rdi)
1003 L(P3Q7): mov    %rax, -0x3b(%rdi)
1004 L(P3Q6): mov    %rax, -0x33(%rdi)
1005 L(P3Q5): mov    %rax, -0x2b(%rdi)
1006 L(P3Q4): mov    %rax, -0x23(%rdi)
1007 L(P3Q3): mov    %rax, -0x1b(%rdi)
1008 L(P3Q2): mov    %rax, -0x13(%rdi)
1009 L(P3Q1): mov    %rax, -0xb(%rdi)
1010 L(P3Q0): mov    %ax, -0x3(%rdi)
1011          mov    %al, -0x1(%rdi)
1012          ret
1013 
1014         .p2align 4
1015 L(P4Q9): mov    %rax, -0x4c(%rdi)
1016 L(P4Q8): mov    %rax, -0x44(%rdi)
1017 L(P4Q7): mov    %rax, -0x3c(%rdi)
1018 L(P4Q6): mov    %rax, -0x34(%rdi)
1019 L(P4Q5): mov    %rax, -0x2c(%rdi)
1020 L(P4Q4): mov    %rax, -0x24(%rdi)
1021 L(P4Q3): mov    %rax, -0x1c(%rdi)
1022 L(P4Q2): mov    %rax, -0x14(%rdi)
1023 L(P4Q1): mov    %rax, -0xc(%rdi)
1024 L(P4Q0): mov    %eax, -0x4(%rdi)
1025          ret
1026 
1027         .p2align 4
1028 L(P5Q9): mov    %rax, -0x4d(%rdi)
1029 L(P5Q8): mov    %rax, -0x45(%rdi)
1030 L(P5Q7): mov    %rax, -0x3d(%rdi)
1031 L(P5Q6): mov    %rax, -0x35(%rdi)
1032 L(P5Q5): mov    %rax, -0x2d(%rdi)
1033 L(P5Q4): mov    %rax, -0x25(%rdi)
1034 L(P5Q3): mov    %rax, -0x1d(%rdi)
1035 L(P5Q2): mov    %rax, -0x15(%rdi)
1036 L(P5Q1): mov    %rax, -0xd(%rdi)
1037 L(P5Q0): mov    %eax, -0x5(%rdi)
1038          mov    %al, -0x1(%rdi)
1039          ret
1040 
1041         .p2align 4
1042 L(P6Q9): mov    %rax, -0x4e(%rdi)
1043 L(P6Q8): mov    %rax, -0x46(%rdi)
1044 L(P6Q7): mov    %rax, -0x3e(%rdi)
1045 L(P6Q6): mov    %rax, -0x36(%rdi)
1046 L(P6Q5): mov    %rax, -0x2e(%rdi)
1047 L(P6Q4): mov    %rax, -0x26(%rdi)
1048 L(P6Q3): mov    %rax, -0x1e(%rdi)
1049 L(P6Q2): mov    %rax, -0x16(%rdi)
1050 L(P6Q1): mov    %rax, -0xe(%rdi)
1051 L(P6Q0): mov    %eax, -0x6(%rdi)
1052          mov    %ax, -0x2(%rdi)
1053          ret
1054 
1055         .p2align 4
1056 L(P7Q9): mov    %rax, -0x4f(%rdi)
1057 L(P7Q8): mov    %rax, -0x47(%rdi)
1058 L(P7Q7): mov    %rax, -0x3f(%rdi)
1059 L(P7Q6): mov    %rax, -0x37(%rdi)
1060 L(P7Q5): mov    %rax, -0x2f(%rdi)
1061 L(P7Q4): mov    %rax, -0x27(%rdi)
1062 L(P7Q3): mov    %rax, -0x1f(%rdi)
1063 L(P7Q2): mov    %rax, -0x17(%rdi)
1064 L(P7Q1): mov    %rax, -0xf(%rdi)
1065 L(P7Q0): mov    %eax, -0x7(%rdi)
1066          mov    %ax, -0x3(%rdi)
1067          mov    %al, -0x1(%rdi)
1068          ret
1069 
        /*
         * Align to a 16-byte boundary. Avoids penalties from unaligned stores
         * as well as from stores spanning cachelines. Note that 16-byte
         * alignment is better in the case where rep sstoq is used.
         */
1075         .p2align 4
1076 L(ck_align):
1077         test    $0xf, %rdi
1078         jz      L(aligned_now)
1079         test    $1, %rdi
1080         jz      2f
1081         mov     %al, (%rdi)
1082         dec     %rsi
1083         lea     1(%rdi),%rdi
1084 2:
1085         test    $2, %rdi
1086         jz      4f
1087         mov     %ax, (%rdi)
1088         sub     $2, %rsi
1089         lea     2(%rdi),%rdi
1090 4:
1091         test    $4, %rdi
1092         jz      8f
1093         mov     %eax, (%rdi)
1094         sub     $4, %rsi
1095         lea     4(%rdi),%rdi
1096 8:
1097         test    $8, %rdi
1098         jz      L(aligned_now)
1099         mov     %rax, (%rdi)
1100         sub     $8, %rsi
1101         lea     8(%rdi),%rdi
1102 
1103         /*
1104          * For large sizes rep sstoq is fastest.
1105          * Transition point determined experimentally as measured on
1106          * Intel Xeon processors (incl. Nehalem) and AMD Opteron.
1107          */
1108 L(aligned_now):
1109         cmp     $BZERO_USE_REP, %rsi
1110         ja      L(use_rep)
1111 
        /*
         * Zero 64 bytes per loop
         */
1115         .p2align 4
1116 L(bzero_loop):
1117         leaq    -0x40(%rsi), %rsi
1118         cmpq    $0x40, %rsi
1119         movq    %rax, (%rdi)
1120         movq    %rax, 0x8(%rdi)
1121         movq    %rax, 0x10(%rdi)
1122         movq    %rax, 0x18(%rdi)
1123         movq    %rax, 0x20(%rdi)
1124         movq    %rax, 0x28(%rdi)
1125         movq    %rax, 0x30(%rdi)
1126         movq    %rax, 0x38(%rdi)
1127         leaq    0x40(%rdi), %rdi
1128         jae     L(bzero_loop)
1129 
        /*
         * Clear any remaining bytes.
         */
1133 9:
1134         leaq    L(setPxQx)(%rip), %r10
1135         addq    %rsi, %rdi
1136         movslq  (%r10,%rsi,4), %rcx
1137         leaq    (%rcx,%r10,1), %r10
1138         INDIRECT_JMP_REG(r10)
1139 
1140         /*
1141          * Use rep sstoq. Clear any remainder via unrolled code
1142          */
1143         .p2align 4
1144 L(use_rep):
1145         movq    %rsi, %rcx              /* get size in bytes */
1146         shrq    $3, %rcx                /* count of 8-byte words to zero */
1147         rep
1148           sstoq                         /* %rcx = words to clear (%rax=0) */
1149         andq    $7, %rsi                /* remaining bytes */
1150         jnz     9b
1151         ret
1152 #undef  L
1153         SET_SIZE(bzero_altentry)
1154         SET_SIZE(bzero)
1155 
/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults.
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin_nta() and
 * xcopyout_nta() which return the errno that we've faithfully computed.
 * This allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
 */
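
/*
 * For reference, the C-level shapes of these routines are conventionally
 * (prototypes assumed from the usual declarations, not restated from this
 * file):
 *
 *	int copyin(const void *uaddr, void *kaddr, size_t count);
 *	int copyout(const void *kaddr, void *uaddr, size_t count);
 *	int xcopyin_nta(const void *uaddr, void *kaddr, size_t count,
 *	    int copy_cached);
 *	int xcopyout_nta(const void *kaddr, void *uaddr, size_t count,
 *	    int copy_cached);
 *
 * copyin/copyout return 0 or -1 as described above; the *_nta variants
 * return 0 or an errno value.  A non-zero copy_cached argument (arriving in
 * %rcx) requests the regular cached copy path instead of the non-temporal
 * one.
 */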
1173 
1174 /*
1175  * Copy user data to kernel space.
1176  */
1177 
1178         ENTRY(copyin)
1179         pushq   %rbp
1180         movq    %rsp, %rbp
1181         subq    $24, %rsp
1182 
1183         /*
1184          * save args in case we trap and need to rerun as a copyop
1185          */
1186         movq    %rdi, (%rsp)
1187         movq    %rsi, 0x8(%rsp)
1188         movq    %rdx, 0x10(%rsp)
1189 
1190         movq    kernelbase(%rip), %rax
1191 #ifdef DEBUG
1192         cmpq    %rax, %rsi              /* %rsi = kaddr */
1193         jnb     1f
1194         leaq    .copyin_panic_msg(%rip), %rdi
1195         xorl    %eax, %eax
1196         call    panic
1197 1:
1198 #endif
1199         /*
1200          * pass lofault value as 4th argument to do_copy_fault
1201          */
1202         leaq    _copyin_err(%rip), %rcx
1203 
1204         movq    %gs:CPU_THREAD, %r9
1205         cmpq    %rax, %rdi              /* test uaddr < kernelbase */
        jae     3f                      /* take copyop if uaddr >= kernelbase */
1207         SMAP_DISABLE_INSTR(0)
1208         jmp     do_copy_fault           /* Takes care of leave for us */
1209 
1210 _copyin_err:
1211         SMAP_ENABLE_INSTR(2)
1212         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1213         addq    $8, %rsp                /* pop bcopy_altentry call ret addr */
1214 3:
1215         movq    T_COPYOPS(%r9), %rax
1216         cmpq    $0, %rax
1217         jz      2f
1218         /*
1219          * reload args for the copyop
1220          */
1221         movq    (%rsp), %rdi
1222         movq    0x8(%rsp), %rsi
1223         movq    0x10(%rsp), %rdx
1224         leave
1225         movq    CP_COPYIN(%rax), %rax
1226         INDIRECT_JMP_REG(rax)
1227 
1228 2:      movl    $-1, %eax
1229         leave
1230         ret
1231         SET_SIZE(copyin)
1232 
1233         ENTRY(xcopyin_nta)
1234         pushq   %rbp
1235         movq    %rsp, %rbp
1236         subq    $24, %rsp
1237 
1238         /*
1239          * save args in case we trap and need to rerun as a copyop
1240          * %rcx is consumed in this routine so we don't need to save
1241          * it.
1242          */
1243         movq    %rdi, (%rsp)
1244         movq    %rsi, 0x8(%rsp)
1245         movq    %rdx, 0x10(%rsp)
1246 
1247         movq    kernelbase(%rip), %rax
1248 #ifdef DEBUG
1249         cmpq    %rax, %rsi              /* %rsi = kaddr */
1250         jnb     1f
1251         leaq    .xcopyin_panic_msg(%rip), %rdi
1252         xorl    %eax, %eax
1253         call    panic
1254 1:
1255 #endif
1256         movq    %gs:CPU_THREAD, %r9
1257         cmpq    %rax, %rdi              /* test uaddr < kernelbase */
1258         jae     4f
1259         cmpq    $0, %rcx                /* No non-temporal access? */
1260         /*
1261          * pass lofault value as 4th argument to do_copy_fault
1262          */
1263         leaq    _xcopyin_err(%rip), %rcx        /* doesn't set rflags */
1264         jnz     6f                      /* use regular access */
1265         /*
1266          * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1267          */
1268         cmpq    $XCOPY_MIN_SIZE, %rdx
1269         jae     5f
1270 6:
1271         SMAP_DISABLE_INSTR(1)
1272         jmp     do_copy_fault
1273 
1274         /*
1275          * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1276          * count is COUNT_ALIGN_SIZE aligned.
1277          */
1278 5:
1279         movq    %rdi, %r10
1280         orq     %rsi, %r10
1281         andq    $NTA_ALIGN_MASK, %r10
1282         orq     %rdx, %r10
1283         andq    $COUNT_ALIGN_MASK, %r10
1284         jnz     6b
1285         leaq    _xcopyin_nta_err(%rip), %rcx    /* doesn't set rflags */
1286         SMAP_DISABLE_INSTR(2)
1287         jmp     do_copy_fault_nta       /* use non-temporal access */
1288 
1289 4:
1290         movl    $EFAULT, %eax
1291         jmp     3f
1292 
1293         /*
1294          * A fault during do_copy_fault or do_copy_fault_nta is
1295          * indicated through an errno value in %rax and we iret from the
1296          * trap handler to here.
1297          */
1298 _xcopyin_err:
1299         addq    $8, %rsp                /* pop bcopy_altentry call ret addr */
1300 _xcopyin_nta_err:
1301         SMAP_ENABLE_INSTR(3)
1302         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1303 3:
1304         movq    T_COPYOPS(%r9), %r8
1305         cmpq    $0, %r8
1306         jz      2f
1307 
1308         /*
1309          * reload args for the copyop
1310          */
1311         movq    (%rsp), %rdi
1312         movq    0x8(%rsp), %rsi
1313         movq    0x10(%rsp), %rdx
1314         leave
1315         movq    CP_XCOPYIN(%r8), %r8
1316         INDIRECT_JMP_REG(r8)
1317 
1318 2:      leave
1319         ret
1320         SET_SIZE(xcopyin_nta)
1321 
1322 /*
1323  * Copy kernel data to user space.
1324  */
1325 
1326         ENTRY(copyout)
1327         pushq   %rbp
1328         movq    %rsp, %rbp
1329         subq    $24, %rsp
1330 
1331         /*
1332          * save args in case we trap and need to rerun as a copyop
1333          */
1334         movq    %rdi, (%rsp)
1335         movq    %rsi, 0x8(%rsp)
1336         movq    %rdx, 0x10(%rsp)
1337 
1338         movq    kernelbase(%rip), %rax
1339 #ifdef DEBUG
1340         cmpq    %rax, %rdi              /* %rdi = kaddr */
1341         jnb     1f
1342         leaq    .copyout_panic_msg(%rip), %rdi
1343         xorl    %eax, %eax
1344         call    panic
1345 1:
1346 #endif
1347         /*
1348          * pass lofault value as 4th argument to do_copy_fault
1349          */
1350         leaq    _copyout_err(%rip), %rcx
1351 
1352         movq    %gs:CPU_THREAD, %r9
1353         cmpq    %rax, %rsi              /* test uaddr < kernelbase */
        jae     3f                      /* take copyop if uaddr >= kernelbase */
1355         SMAP_DISABLE_INSTR(3)
1356         jmp     do_copy_fault           /* Calls leave for us */
1357 
1358 _copyout_err:
1359         SMAP_ENABLE_INSTR(4)
1360         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1361         addq    $8, %rsp                /* pop bcopy_altentry call ret addr */
1362 3:
1363         movq    T_COPYOPS(%r9), %rax
1364         cmpq    $0, %rax
1365         jz      2f
1366 
1367         /*
1368          * reload args for the copyop
1369          */
1370         movq    (%rsp), %rdi
1371         movq    0x8(%rsp), %rsi
1372         movq    0x10(%rsp), %rdx
1373         leave
1374         movq    CP_COPYOUT(%rax), %rax
1375         INDIRECT_JMP_REG(rax)
1376 
1377 2:      movl    $-1, %eax
1378         leave
1379         ret
1380         SET_SIZE(copyout)
1381 
1382         ENTRY(xcopyout_nta)
1383         pushq   %rbp
1384         movq    %rsp, %rbp
1385         subq    $24, %rsp
1386 
1387         /*
1388          * save args in case we trap and need to rerun as a copyop
1389          */
1390         movq    %rdi, (%rsp)
1391         movq    %rsi, 0x8(%rsp)
1392         movq    %rdx, 0x10(%rsp)
1393 
1394         movq    kernelbase(%rip), %rax
1395 #ifdef DEBUG
1396         cmpq    %rax, %rdi              /* %rdi = kaddr */
1397         jnb     1f
1398         leaq    .xcopyout_panic_msg(%rip), %rdi
1399         xorl    %eax, %eax
1400         call    panic
1401 1:
1402 #endif
1403         movq    %gs:CPU_THREAD, %r9
1404         cmpq    %rax, %rsi              /* test uaddr < kernelbase */
1405         jae     4f
1406 
1407         cmpq    $0, %rcx                /* No non-temporal access? */
1408         /*
1409          * pass lofault value as 4th argument to do_copy_fault
1410          */
1411         leaq    _xcopyout_err(%rip), %rcx
1412         jnz     6f
1413         /*
1414          * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1415          */
1416         cmpq    $XCOPY_MIN_SIZE, %rdx
1417         jae     5f
1418 6:
1419         SMAP_DISABLE_INSTR(4)
1420         jmp     do_copy_fault
1421 
1422         /*
1423          * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1424          * count is COUNT_ALIGN_SIZE aligned.
1425          */
1426 5:
1427         movq    %rdi, %r10
1428         orq     %rsi, %r10
1429         andq    $NTA_ALIGN_MASK, %r10
1430         orq     %rdx, %r10
1431         andq    $COUNT_ALIGN_MASK, %r10
1432         jnz     6b
1433         leaq    _xcopyout_nta_err(%rip), %rcx
1434         SMAP_DISABLE_INSTR(5)
1435         call    do_copy_fault_nta
1436         SMAP_ENABLE_INSTR(5)
1437         ret
1438 
1439 4:
1440         movl    $EFAULT, %eax
1441         jmp     3f
1442 
1443         /*
1444          * A fault during do_copy_fault or do_copy_fault_nta is
1445          * indicated through an errno value in %rax and we iret from the
1446          * trap handler to here.
1447          */
1448 _xcopyout_err:
1449         addq    $8, %rsp                /* pop bcopy_altentry call ret addr */
1450 _xcopyout_nta_err:
1451         SMAP_ENABLE_INSTR(6)
1452         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1453 3:
1454         movq    T_COPYOPS(%r9), %r8
1455         cmpq    $0, %r8
1456         jz      2f
1457 
1458         /*
1459          * reload args for the copyop
1460          */
1461         movq    (%rsp), %rdi
1462         movq    0x8(%rsp), %rsi
1463         movq    0x10(%rsp), %rdx
1464         leave
1465         movq    CP_XCOPYOUT(%r8), %r8
1466         INDIRECT_JMP_REG(r8)
1467 
1468 2:      leave
1469         ret
1470         SET_SIZE(xcopyout_nta)
1471 
1472 /*
1473  * Copy a null terminated string from one point to another in
1474  * the kernel address space.
1475  */
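
/*
 * For reference, copystr's conventional C-level contract (assumed, not
 * restated from this file):
 *
 *	int copystr(const char *from, char *to, size_t maxlength,
 *	    size_t *lencopied);
 *
 * Returns 0 on success or ENAMETOOLONG if the string does not fit in
 * maxlength bytes.  When lencopied is non-NULL, the number of bytes copied,
 * including the terminating NUL, is stored through it.
 */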
1476 
1477         ENTRY(copystr)
1478         pushq   %rbp
1479         movq    %rsp, %rbp
1480 #ifdef DEBUG
1481         movq    kernelbase(%rip), %rax
1482         cmpq    %rax, %rdi              /* %rdi = from */
1483         jb      0f
1484         cmpq    %rax, %rsi              /* %rsi = to */
1485         jnb     1f
1486 0:      leaq    .copystr_panic_msg(%rip), %rdi
1487         xorl    %eax, %eax
1488         call    panic
1489 1:
1490 #endif
1491         movq    %gs:CPU_THREAD, %r9
1492         movq    T_LOFAULT(%r9), %r8     /* pass current lofault value as */
1493                                         /* 5th argument to do_copystr */
1494         xorl    %r10d,%r10d             /* pass smap restore need in %r10d */
1495                                         /* as a non-ABI 6th arg */
1496 do_copystr:
1497         movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
1498         movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
1499         movq    %r8, T_LOFAULT(%r9)     /* new lofault */
1500 
1501         movq    %rdx, %r8               /* save maxlength */
1502 
1503         cmpq    $0, %rdx                /* %rdx = maxlength */
1504         je      copystr_enametoolong    /* maxlength == 0 */
1505 
1506 copystr_loop:
1507         decq    %r8
1508         movb    (%rdi), %al
1509         incq    %rdi
1510         movb    %al, (%rsi)
1511         incq    %rsi
1512         cmpb    $0, %al
1513         je      copystr_null            /* null char */
1514         cmpq    $0, %r8
1515         jne     copystr_loop
1516 
1517 copystr_enametoolong:
1518         movl    $ENAMETOOLONG, %eax
1519         jmp     copystr_out
1520 
1521 copystr_null:
1522         xorl    %eax, %eax              /* no error */
1523 
1524 copystr_out:
1525         cmpq    $0, %rcx                /* want length? */
1526         je      copystr_smap            /* no */
1527         subq    %r8, %rdx               /* compute length and store it */
1528         movq    %rdx, (%rcx)
1529 
1530 copystr_smap:
1531         cmpl    $0, %r10d
1532         jz      copystr_done
1533         SMAP_ENABLE_INSTR(7)
1534 
1535 copystr_done:
1536         movq    %r11, T_LOFAULT(%r9)    /* restore the original lofault */
1537         leave
1538         ret
1539         SET_SIZE(copystr)
1540 
1541 /*
1542  * Copy a null terminated string from the user address space into
1543  * the kernel address space.
1544  */
1545 
1546         ENTRY(copyinstr)
1547         pushq   %rbp
1548         movq    %rsp, %rbp
1549         subq    $32, %rsp
1550 
1551         /*
1552          * save args in case we trap and need to rerun as a copyop
1553          */
1554         movq    %rdi, (%rsp)
1555         movq    %rsi, 0x8(%rsp)
1556         movq    %rdx, 0x10(%rsp)
1557         movq    %rcx, 0x18(%rsp)
1558 
1559         movq    kernelbase(%rip), %rax
1560 #ifdef DEBUG
1561         cmpq    %rax, %rsi              /* %rsi = kaddr */
1562         jnb     1f
1563         leaq    .copyinstr_panic_msg(%rip), %rdi
1564         xorl    %eax, %eax
1565         call    panic
1566 1:
1567 #endif
        /*
         * pass lofault value as 5th argument to do_copystr
         * do_copystr expects the SMAP restore flag in %r10d
         */
1572         leaq    _copyinstr_error(%rip), %r8
1573         movl    $1, %r10d
1574 
1575         cmpq    %rax, %rdi              /* test uaddr < kernelbase */
1576         jae     4f
1577         SMAP_DISABLE_INSTR(6)
1578         jmp     do_copystr
1579 4:
1580         movq    %gs:CPU_THREAD, %r9
1581         jmp     3f
1582 
1583 _copyinstr_error:
1584         SMAP_ENABLE_INSTR(8)
1585         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1586 3:
1587         movq    T_COPYOPS(%r9), %rax
1588         cmpq    $0, %rax
1589         jz      2f
1590 
1591         /*
1592          * reload args for the copyop
1593          */
1594         movq    (%rsp), %rdi
1595         movq    0x8(%rsp), %rsi
1596         movq    0x10(%rsp), %rdx
1597         movq    0x18(%rsp), %rcx
1598         leave
1599         movq    CP_COPYINSTR(%rax), %rax
1600         INDIRECT_JMP_REG(rax)
1601 
1602 2:      movl    $EFAULT, %eax           /* return EFAULT */
1603         leave
1604         ret
1605         SET_SIZE(copyinstr)
1606 
1607 /*
1608  * Copy a null terminated string from the kernel
1609  * address space to the user address space.
1610  */
1611 
1612         ENTRY(copyoutstr)
1613         pushq   %rbp
1614         movq    %rsp, %rbp
1615         subq    $32, %rsp
1616 
1617         /*
1618          * save args in case we trap and need to rerun as a copyop
1619          */
1620         movq    %rdi, (%rsp)
1621         movq    %rsi, 0x8(%rsp)
1622         movq    %rdx, 0x10(%rsp)
1623         movq    %rcx, 0x18(%rsp)
1624 
1625         movq    kernelbase(%rip), %rax
1626 #ifdef DEBUG
1627         cmpq    %rax, %rdi              /* %rdi = kaddr */
1628         jnb     1f
1629         leaq    .copyoutstr_panic_msg(%rip), %rdi
1630         jmp     call_panic              /* setup stack and call panic */
1631 1:
1632 #endif
1633         /*
1634          * pass the lofault value to do_copystr in %r8 (5th argument); pass 1 in
1635          * %r10d so do_copystr knows it must re-enable SMAP on the way out
1636          */
1637         leaq    _copyoutstr_error(%rip), %r8
1638         movl    $1, %r10d
1639 
1640         cmpq    %rax, %rsi              /* test uaddr < kernelbase */
1641         jae     4f
1642         SMAP_DISABLE_INSTR(7)
1643         jmp     do_copystr
1644 4:
1645         movq    %gs:CPU_THREAD, %r9
1646         jmp     3f
1647 
1648 _copyoutstr_error:
1649         SMAP_ENABLE_INSTR(9)
1650         movq    %r11, T_LOFAULT(%r9)    /* restore the original lofault */
1651 3:
1652         movq    T_COPYOPS(%r9), %rax
1653         cmpq    $0, %rax
1654         jz      2f
1655 
1656         /*
1657          * reload args for the copyop
1658          */
1659         movq    (%rsp), %rdi
1660         movq    0x8(%rsp), %rsi
1661         movq    0x10(%rsp), %rdx
1662         movq    0x18(%rsp), %rcx
1663         leave
1664         movq    CP_COPYOUTSTR(%rax), %rax
1665         INDIRECT_JMP_REG(rax)
1666 
1667 2:      movl    $EFAULT, %eax           /* return EFAULT */
1668         leave
1669         ret
1670         SET_SIZE(copyoutstr)
1671 
1672 /*
1673  * Since all of the fuword() variants are so similar, we have a macro to spit
1674  * them out.  This allows us to create DTrace-unobservable functions easily.
1675  */
1676 
1677 /*
1678  * Note that we don't save and reload the arguments here
1679  * because their values are not altered in the copy path.
1680  * Additionally, on success we re-enable SMAP inline and
1681  * return directly to our original caller.
1682  */
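/*
 * An illustrative sketch (not part of this file) of how one of the generated
 * routines is used, assuming prototypes of the form
 *
 *	int fuword32(const void *uaddr, uint32_t *valuep);
 *
 * uaddr below is a hypothetical user address:
 *
 *	uint32_t val;
 *
 *	if (fuword32(uaddr, &val) == -1)
 *		return (EFAULT);
 *
 * -1 comes back when the access faults (or uaddr is above kernelbase) and no
 * copyops vector is installed; on success the word read from user space is
 * stored through the kernel-side pointer and 0 is returned.
 */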
1683 
1684 #define FUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2)      \
1685         ENTRY(NAME)                             \
1686         movq    %gs:CPU_THREAD, %r9;            \
1687         cmpq    kernelbase(%rip), %rdi;         \
1688         jae     1f;                             \
1689         leaq    _flt_/**/NAME, %rdx;            \
1690         movq    %rdx, T_LOFAULT(%r9);           \
1691         SMAP_DISABLE_INSTR(DISNUM)              \
1692         INSTR   (%rdi), REG;                    \
1693         movq    $0, T_LOFAULT(%r9);             \
1694         INSTR   REG, (%rsi);                    \
1695         xorl    %eax, %eax;                     \
1696         SMAP_ENABLE_INSTR(EN1)                  \
1697         ret;                                    \
1698 _flt_/**/NAME:                                  \
1699         SMAP_ENABLE_INSTR(EN2)                  \
1700         movq    $0, T_LOFAULT(%r9);             \
1701 1:                                              \
1702         movq    T_COPYOPS(%r9), %rax;           \
1703         cmpq    $0, %rax;                       \
1704         jz      2f;                             \
1705         movq    COPYOP(%rax), %rax;             \
1706         INDIRECT_JMP_REG(rax);                  \
1707 2:                                              \
1708         movl    $-1, %eax;                      \
1709         ret;                                    \
1710         SET_SIZE(NAME)
1711 
1712         FUWORD(fuword64, movq, %rax, CP_FUWORD64,8,10,11)
1713         FUWORD(fuword32, movl, %eax, CP_FUWORD32,9,12,13)
1714         FUWORD(fuword16, movw, %ax, CP_FUWORD16,10,14,15)
1715         FUWORD(fuword8, movb, %al, CP_FUWORD8,11,16,17)
1716 
1717 #undef  FUWORD
1718 
1719 /*
1720  * Set user word.
1721  */
1722 
1723 /*
1724  * Note that we don't save and reload the arguments here
1725  * because their values are not altered in the copy path.
1726  */
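/*
 * An illustrative sketch (not part of this file) of how one of the generated
 * routines is used, assuming prototypes of the form
 *
 *	int suword32(void *uaddr, uint32_t value);
 *
 * uaddr below is a hypothetical user address:
 *
 *	if (suword32(uaddr, 42) == -1)
 *		return (EFAULT);
 *
 * As with the fuword() routines above, -1 indicates a fault (or a kernel
 * address) with no copyops vector installed, and 0 indicates the store
 * succeeded.
 */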
1727 
1728 #define SUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2)      \
1729         ENTRY(NAME)                             \
1730         movq    %gs:CPU_THREAD, %r9;            \
1731         cmpq    kernelbase(%rip), %rdi;         \
1732         jae     1f;                             \
1733         leaq    _flt_/**/NAME, %rdx;            \
1734         SMAP_DISABLE_INSTR(DISNUM)              \
1735         movq    %rdx, T_LOFAULT(%r9);           \
1736         INSTR   REG, (%rdi);                    \
1737         movq    $0, T_LOFAULT(%r9);             \
1738         xorl    %eax, %eax;                     \
1739         SMAP_ENABLE_INSTR(EN1)                  \
1740         ret;                                    \
1741 _flt_/**/NAME:                                  \
1742         SMAP_ENABLE_INSTR(EN2)                  \
1743         movq    $0, T_LOFAULT(%r9);             \
1744 1:                                              \
1745         movq    T_COPYOPS(%r9), %rax;           \
1746         cmpq    $0, %rax;                       \
1747         jz      3f;                             \
1748         movq    COPYOP(%rax), %rax;             \
1749         INDIRECT_JMP_REG(rax);                  \
1750 3:                                              \
1751         movl    $-1, %eax;                      \
1752         ret;                                    \
1753         SET_SIZE(NAME)
1754 
1755         SUWORD(suword64, movq, %rsi, CP_SUWORD64,12,18,19)
1756         SUWORD(suword32, movl, %esi, CP_SUWORD32,13,20,21)
1757         SUWORD(suword16, movw, %si, CP_SUWORD16,14,22,23)
1758         SUWORD(suword8, movb, %sil, CP_SUWORD8,15,24,25)
1759 
1760 #undef  SUWORD
1761 
1762 #define FUWORD_NOERR(NAME, INSTR, REG)          \
1763         ENTRY(NAME)                             \
1764         cmpq    kernelbase(%rip), %rdi;         \
1765         cmovnbq kernelbase(%rip), %rdi;         /* force fault at kernelbase */ \
1766         INSTR   (%rdi), REG;                    \
1767         INSTR   REG, (%rsi);                    \
1768         ret;                                    \
1769         SET_SIZE(NAME)
1770 
1771         FUWORD_NOERR(fuword64_noerr, movq, %rax)
1772         FUWORD_NOERR(fuword32_noerr, movl, %eax)
1773         FUWORD_NOERR(fuword16_noerr, movw, %ax)
1774         FUWORD_NOERR(fuword8_noerr, movb, %al)
1775 
1776 #undef  FUWORD_NOERR
1777 
1778 #define SUWORD_NOERR(NAME, INSTR, REG)          \
1779         ENTRY(NAME)                             \
1780         cmpq    kernelbase(%rip), %rdi;         \
1781         cmovnbq kernelbase(%rip), %rdi;         /* force fault at kernelbase */ \
1782         INSTR   REG, (%rdi);                    \
1783         ret;                                    \
1784         SET_SIZE(NAME)
1785 
1786         SUWORD_NOERR(suword64_noerr, movq, %rsi)
1787         SUWORD_NOERR(suword32_noerr, movl, %esi)
1788         SUWORD_NOERR(suword16_noerr, movw, %si)
1789         SUWORD_NOERR(suword8_noerr, movb, %sil)
1790 
1791 #undef  SUWORD_NOERR
1792 
1793 
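        /*
         * Aliases for the byte- and long-sized entry points: subyte stores a
         * single byte (suword8), while fulword/sulword operate on a native
         * long, which is 64 bits here, so they map to the 64-bit routines.
         */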
1794         .weak   subyte
1795         subyte=suword8
1796         .weak   subyte_noerr
1797         subyte_noerr=suword8_noerr
1798 
1799         .weak   fulword
1800         fulword=fuword64
1801         .weak   fulword_noerr
1802         fulword_noerr=fuword64_noerr
1803         .weak   sulword
1804         sulword=suword64
1805         .weak   sulword_noerr
1806         sulword_noerr=suword64_noerr
1807 
1808         ENTRY(copyin_noerr)
1809         movq    kernelbase(%rip), %rax
1810 #ifdef DEBUG
1811         cmpq    %rax, %rsi              /* %rsi = kto */
1812         jae     1f
1813         leaq    .cpyin_ne_pmsg(%rip), %rdi
1814         jmp     call_panic              /* setup stack and call panic */
1815 1:
1816 #endif
1817         cmpq    %rax, %rdi              /* ufrom < kernelbase */
1818         jb      do_copy
1819         movq    %rax, %rdi              /* force fault at kernelbase */
1820         jmp     do_copy
1821         SET_SIZE(copyin_noerr)
1822 
1823         ENTRY(copyout_noerr)
1824         movq    kernelbase(%rip), %rax
1825 #ifdef DEBUG
1826         cmpq    %rax, %rdi              /* %rdi = kfrom */
1827         jae     1f
1828         leaq    .cpyout_ne_pmsg(%rip), %rdi
1829         jmp     call_panic              /* setup stack and call panic */
1830 1:
1831 #endif
1832         cmpq    %rax, %rsi              /* uto < kernelbase */
1833         jb      do_copy
1834         movq    %rax, %rsi              /* force fault at kernelbase */
1835         jmp     do_copy
1836         SET_SIZE(copyout_noerr)
1837 
1838         ENTRY(uzero)
1839         movq    kernelbase(%rip), %rax
1840         cmpq    %rax, %rdi
1841         jb      do_zero
1842         movq    %rax, %rdi      /* force fault at kernelbase */
1843         jmp     do_zero
1844         SET_SIZE(uzero)
1845 
1846         ENTRY(ucopy)
1847         movq    kernelbase(%rip), %rax
1848         cmpq    %rax, %rdi
1849         cmovaeq %rax, %rdi      /* force fault at kernelbase */
1850         cmpq    %rax, %rsi
1851         cmovaeq %rax, %rsi      /* force fault at kernelbase */
1852         jmp     do_copy
1853         SET_SIZE(ucopy)
1854 
1855         /*
1856          * Note that the frame pointer is required here because do_copystr expects
1857          * to be able to pop it off!
1858          */
1859         ENTRY(ucopystr)
1860         pushq   %rbp
1861         movq    %rsp, %rbp
1862         movq    kernelbase(%rip), %rax
1863         cmpq    %rax, %rdi
1864         cmovaeq %rax, %rdi      /* force fault at kernelbase */
1865         cmpq    %rax, %rsi
1866         cmovaeq %rax, %rsi      /* force fault at kernelbase */
1867         /* do_copystr expects lofault address in %r8 */
1868         /* do_copystr expects the SMAP flag in %r10d; pass 0, ucopystr must not toggle SMAP */
1869         xorl    %r10d, %r10d
1870         movq    %gs:CPU_THREAD, %r8
1871         movq    T_LOFAULT(%r8), %r8
1872         jmp     do_copystr
1873         SET_SIZE(ucopystr)
1874 
1875 #ifdef DEBUG
1876         .data
1877 .kcopy_panic_msg:
1878         .string "kcopy: arguments below kernelbase"
1879 .bcopy_panic_msg:
1880         .string "bcopy: arguments below kernelbase"
1881 .kzero_panic_msg:
1882         .string "kzero: arguments below kernelbase"
1883 .bzero_panic_msg:
1884         .string "bzero: arguments below kernelbase"
1885 .copyin_panic_msg:
1886         .string "copyin: kaddr argument below kernelbase"
1887 .xcopyin_panic_msg:
1888         .string "xcopyin: kaddr argument below kernelbase"
1889 .copyout_panic_msg:
1890         .string "copyout: kaddr argument below kernelbase"
1891 .xcopyout_panic_msg:
1892         .string "xcopyout: kaddr argument below kernelbase"
1893 .copystr_panic_msg:
1894         .string "copystr: arguments in user space"
1895 .copyinstr_panic_msg:
1896         .string "copyinstr: kaddr argument not in kernel address space"
1897 .copyoutstr_panic_msg:
1898         .string "copyoutstr: kaddr argument not in kernel address space"
1899 .cpyin_ne_pmsg:
1900         .string "copyin_noerr: argument not in kernel address space"
1901 .cpyout_ne_pmsg:
1902         .string "copyout_noerr: argument not in kernel address space"
1903 #endif
1904 
1905 /*
1906  * These functions are used for SMAP, supervisor mode access protection. They
1907  * are hotpatched into real instructions at system startup, in mlsetup(), as
1908  * part of enabling the other CR4-related features.
1909  *
1910  * Generally speaking, smap_disable() becomes a stac instruction and smap_enable()
1911  * becomes a clac instruction. It's safe to call these any number of times; in
1912  * fact, out of paranoia, the kernel will likely call them at several points.
1913  */
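/*
 * Concretely, the three nops in each routine are overwritten with the 3-byte
 * encodings of the real instructions once CR4.SMAP has been turned on:
 *
 *	smap_disable():  nop; nop; nop  ->  stac  (0x0f, 0x01, 0xcb)
 *	smap_enable():   nop; nop; nop  ->  clac  (0x0f, 0x01, 0xca)
 *
 * A minimal C sketch of that patch step; patch_text() is a hypothetical
 * helper, not the kernel's actual patching interface, and the feature test
 * is illustrative only:
 *
 *	static const uint8_t stac_insn[3] = { 0x0f, 0x01, 0xcb };
 *	static const uint8_t clac_insn[3] = { 0x0f, 0x01, 0xca };
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_SMAP)) {
 *		patch_text((caddr_t)smap_disable, stac_insn, sizeof (stac_insn));
 *		patch_text((caddr_t)smap_enable, clac_insn, sizeof (clac_insn));
 *	}
 */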
1914 
1915         ENTRY(smap_disable)
1916         nop
1917         nop
1918         nop
1919         ret
1920         SET_SIZE(smap_disable)
1921 
1922         ENTRY(smap_enable)
1923         nop
1924         nop
1925         nop
1926         ret
1927         SET_SIZE(smap_enable)
1928 
1929 .data
1930 .align  4
1931 .globl  _smap_enable_patch_count
1932 .type   _smap_enable_patch_count,@object
1933 .size   _smap_enable_patch_count, 4
1934 _smap_enable_patch_count:
1935         .long   SMAP_ENABLE_COUNT
1936 
1937 .globl  _smap_disable_patch_count
1938 .type   _smap_disable_patch_count,@object
1939 .size   _smap_disable_patch_count, 4
1940 _smap_disable_patch_count:
1941         .long SMAP_DISABLE_COUNT