1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2009, Intel Corporation
  28  * All rights reserved.
  29  */
  30 
  31 /*       Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.        */
  32 /*       Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T              */
  33 /*         All Rights Reserved                                          */
  34 
  35 /*       Copyright (c) 1987, 1988 Microsoft Corporation                 */
  36 /*         All Rights Reserved                                          */
  37 
  38 /*
  39  * Copyright 2020 Joyent, Inc.
  40  */
  41 
  42 #include <sys/errno.h>
  43 #include <sys/asm_linkage.h>
  44 
  45 #include "assym.h"
  46 
  47 #define KCOPY_MIN_SIZE  128     /* Must be >= 16 bytes */
  48 #define XCOPY_MIN_SIZE  128     /* Must be >= 16 bytes */
  49 /*
  50  * Non-temporal access (NTA) alignment requirement
  51  */
  52 #define NTA_ALIGN_SIZE  4       /* Must be at least 4-byte aligned */
  53 #define NTA_ALIGN_MASK  _CONST(NTA_ALIGN_SIZE-1)
  54 #define COUNT_ALIGN_SIZE        16      /* Must be at least 16-byte aligned */
  55 #define COUNT_ALIGN_MASK        _CONST(COUNT_ALIGN_SIZE-1)
  56 
  57 /*
  58  * With the introduction of Broadwell, Intel added supervisor mode access
  59  * protection -- SMAP. SMAP forces the kernel to set certain bits to
  60  * enable access to user pages (AC in rflags, defined as PS_ACHK in
  61  * <sys/psw.h>). One of the challenges is that many of the userland copy
  62  * routines are implemented directly on top of the kernel ones. For example,
  63  * copyin and copyout simply jump to the do_copy_fault label and traditionally
  64  * let that code deal with the return for them. In fact, changing that is a can
  65  * of frame pointers.
  66  *
  67  * Rules and Constraints:
  68  *
  69  * 1. For anything that's not in copy.s, explicit smap_disable() or
  70  * smap_enable() calls must be made.  This is restricted to the following three
  71  * places: DTrace, resume() in swtch.s and on_fault/no_fault. If you want to
  72  * add it somewhere else, think twice first.
  73  *
  74  * 2. We try to toggle this within the smallest window possible. This means
  75  * that if we take a fault, or need to fall back to a copyop in copyin(),
  76  * copyout(), or any other function, we will always leave with SMAP enabled
  77  * (the kernel cannot access user pages).
  78  *
  79  * 3. None of the *_noerr() or ucopy/uzero routines should toggle SMAP. They are
  80  * explicitly only allowed to be called while in an on_fault()/no_fault() handler,
  81  * which already takes care of ensuring that SMAP is enabled and disabled. Note
  82  * this means that when under an on_fault()/no_fault() handler, one must not
  83  * call the non-*_noerr() routines.
  84  *
  85  * 4. The first thing we should do after coming out of an lofault handler is to
  86  * make sure that we call smap_enable() again to ensure that we are safely
  87  * protected, as more often than not, we will have disabled smap to get there.
  88  *
  89  * 5. smap_enable() and smap_disable() don't exist: calls to these functions
  90  * generate runtime relocations that are then processed into the necessary
  91  * clac/stac, via the krtld hotinlines mechanism and hotinline_smap().
  92  *
  93  * 6. For any inline user of SMAP, the appropriate SMAP_ENABLE_INSTR and
  94  * SMAP_DISABLE_INSTR macro should be used. If the number of these is changed,
  95  * you must update the constants SMAP_ENABLE_COUNT and SMAP_DISABLE_COUNT below.
  96  *
  97  * 7. Generally this .s file is processed by a K&R style cpp. This means that it
  98  * really has a lot of feelings about whitespace. In particular, if you have a
  99  * macro FOO with the arguments FOO(1, 3), the second argument is in fact ' 3'.
 100  *
 101  * 8. In general, the kernel has its own value for rflags that gets used. This
 102  * is maintained in a few different places which vary based on how the thread
 103  * comes into existence and whether it's a user thread. When the kernel takes
 104  * a trap, it will always set rflags to a known state, mainly as part of
 105  * ENABLE_INTR_FLAGS and F_OFF and F_ON. These ensure that PS_ACHK is cleared
 106  * for us. In addition, when using the syscall instruction, we mask PS_ACHK
 107  * off via the AMD_SFMASK MSR. See init_cpu_syscall() for where that gets
 108  * masked off.
 109  */
 110 
 111 /*
 112  * The optimal 64-bit bcopy and kcopy for modern x86 processors use
 113  * "rep smovq" for large sizes. Performance data shows that many calls to
 114  * bcopy/kcopy/bzero/kzero operate on small buffers, so unrolled code is used
 115  * for the best performance at these sizes. Medium sizes use loops that write
 116  * 64 bytes per iteration. Transition points were determined experimentally.
 117  */
 118 #define BZERO_USE_REP   (1024)
 119 #define BCOPY_DFLT_REP  (128)
 120 #define BCOPY_NHM_REP   (768)
 121 
 122 /*
 123  * Copy a block of storage, returning an error code if `from' or
 124  * `to' takes a kernel pagefault which cannot be resolved.
 125  * Returns errno value on pagefault error, 0 if all ok
 126  */
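     /*
      * A hedged C-level sketch of the contract implemented here (the kernel's
      * own header declaration is authoritative):
      *
      *	int kcopy(const void *from, void *to, size_t count);
      *
      * Unlike bcopy() below, kcopy() installs a lofault handler, so an
      * unresolvable pagefault returns an errno instead of panicking.
      */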
 127 
 128 /*
 129  * I'm sorry about these macros, but copy.s is unsurprisingly sensitive to
 130  * additional call instructions.
 131  */
 132 #define SMAP_DISABLE_COUNT      16
 133 #define SMAP_ENABLE_COUNT       26
 134 
 135 #define SMAP_DISABLE_INSTR(ITER)                \
 136         .globl  _smap_disable_patch_/**/ITER;   \
 137         _smap_disable_patch_/**/ITER/**/:;      \
 138         nop; nop; nop;
 139 
 140 #define SMAP_ENABLE_INSTR(ITER)                 \
 141         .globl  _smap_enable_patch_/**/ITER;    \
 142         _smap_enable_patch_/**/ITER/**/:;       \
 143         nop; nop; nop;
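     /*
      * A note on how these patch points work (hedged; the krtld/startup code
      * is authoritative): each macro use emits a uniquely named global label
      * followed by three one-byte nops.  On CPUs with SMAP, boot code looks
      * the _smap_*_patch_* symbols up by name and overwrites the three nops
      * with the three-byte stac (for the disable sites) or clac (for the
      * enable sites) instruction; on CPUs without SMAP the nops are left in
      * place.  This is why ITER must be unique per call site and why
      * SMAP_DISABLE_COUNT/SMAP_ENABLE_COUNT above must track the totals.
      */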
 144 
 145         .globl  kernelbase
 146         .globl  postbootkernelbase
 147 
 148         ENTRY(kcopy)
 149         pushq   %rbp
 150         movq    %rsp, %rbp
 151 #ifdef DEBUG
 152         cmpq    postbootkernelbase(%rip), %rdi          /* %rdi = from */
 153         jb      0f
 154         cmpq    postbootkernelbase(%rip), %rsi          /* %rsi = to */
 155         jnb     1f
 156 0:      leaq    .kcopy_panic_msg(%rip), %rdi
 157         xorl    %eax, %eax
 158         call    panic
 159 1:
 160 #endif
 161         /*
 162          * pass lofault value as 4th argument to do_copy_fault
 163          */
 164         leaq    _kcopy_copyerr(%rip), %rcx
 165         movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
 166 
 167 do_copy_fault:
 168         movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
 169         movq    %rcx, T_LOFAULT(%r9)    /* new lofault */
 170         call    bcopy_altentry
 171         xorl    %eax, %eax              /* return 0 (success) */
 172         SMAP_ENABLE_INSTR(0)
 173 
 174         /*
 175          * A fault during do_copy_fault is indicated through an errno value
 176          * in %rax and we iretq from the trap handler to here.
 177          */
 178 _kcopy_copyerr:
 179         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
 180         leave
 181         ret
 182         SET_SIZE(kcopy)
 183 
 184 #undef  ARG_FROM
 185 #undef  ARG_TO
 186 #undef  ARG_COUNT
 187 
 188 #define COPY_LOOP_INIT(src, dst, cnt)   \
 189         addq    cnt, src;                       \
 190         addq    cnt, dst;                       \
 191         shrq    $3, cnt;                        \
 192         neg     cnt
 193 
 194         /* Copy 16 bytes per loop.  Uses %rax and %r8 */
 195 #define COPY_LOOP_BODY(src, dst, cnt)   \
 196         prefetchnta     0x100(src, cnt, 8);     \
 197         movq    (src, cnt, 8), %rax;            \
 198         movq    0x8(src, cnt, 8), %r8;          \
 199         movnti  %rax, (dst, cnt, 8);            \
 200         movnti  %r8, 0x8(dst, cnt, 8);          \
 201         addq    $2, cnt
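     /*
      * The two macros above implement a negative-index copy loop: src and dst
      * are advanced to one-past-the-end and cnt becomes the negated quadword
      * count, so the loop index climbs toward zero.  A hedged C sketch of the
      * equivalent logic (assuming cnt is a multiple of 16, which the
      * COUNT_ALIGN_SIZE check guarantees for the non-temporal path):
      *
      *	const uint64_t *s = (const uint64_t *)((const char *)src + cnt);
      *	uint64_t *d = (uint64_t *)((char *)dst + cnt);
      *	for (int64_t i = -((int64_t)cnt >> 3); i != 0; i += 2) {
      *		d[i] = s[i];
      *		d[i + 1] = s[i + 1];
      *	}
      *
      * with the stores done via movnti so they bypass the caches, a
      * prefetchnta issued ahead of the reads, and an mfence afterwards (in
      * the caller) to order the non-temporal stores.
      */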
 202 
 203         ENTRY(kcopy_nta)
 204         pushq   %rbp
 205         movq    %rsp, %rbp
 206 #ifdef DEBUG
 207         cmpq    postbootkernelbase(%rip), %rdi          /* %rdi = from */
 208         jb      0f
 209         cmpq    postbootkernelbase(%rip), %rsi          /* %rsi = to */
 210         jnb     1f
 211 0:      leaq    .kcopy_panic_msg(%rip), %rdi
 212         xorl    %eax, %eax
 213         call    panic
 214 1:
 215 #endif
 216 
 217         movq    %gs:CPU_THREAD, %r9
 218         cmpq    $0, %rcx                /* No non-temporal access? */
 219         /*
 220          * pass lofault value as 4th argument to do_copy_fault
 221          */
 222         leaq    _kcopy_nta_copyerr(%rip), %rcx  /* doesn't set rflags */
 223         jnz     do_copy_fault           /* use regular access */
 224         /*
 225          * Make sure cnt is >= KCOPY_MIN_SIZE
 226          */
 227         cmpq    $KCOPY_MIN_SIZE, %rdx
 228         jb      do_copy_fault
 229 
 230         /*
 231          * Make sure src and dst are NTA_ALIGN_SIZE aligned,
 232          * count is COUNT_ALIGN_SIZE aligned.
 233          */
 234         movq    %rdi, %r10
 235         orq     %rsi, %r10
 236         andq    $NTA_ALIGN_MASK, %r10
 237         orq     %rdx, %r10
 238         andq    $COUNT_ALIGN_MASK, %r10
 239         jnz     do_copy_fault
 240 
 241         ALTENTRY(do_copy_fault_nta)
 242         movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
 243         movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
 244         movq    %rcx, T_LOFAULT(%r9)    /* new lofault */
 245 
 246         /*
 247          * COPY_LOOP_BODY uses %rax and %r8
 248          */
 249         COPY_LOOP_INIT(%rdi, %rsi, %rdx)
 250 2:      COPY_LOOP_BODY(%rdi, %rsi, %rdx)
 251         jnz     2b
 252 
 253         mfence
 254         xorl    %eax, %eax              /* return 0 (success) */
 255         SMAP_ENABLE_INSTR(1)
 256 
 257 _kcopy_nta_copyerr:
 258         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
 259         leave
 260         ret
 261         SET_SIZE(do_copy_fault_nta)
 262         SET_SIZE(kcopy_nta)
 263 
 264         ENTRY(bcopy)
 265 #ifdef DEBUG
 266         orq     %rdx, %rdx              /* %rdx = count */
 267         jz      1f
 268         cmpq    postbootkernelbase(%rip), %rdi          /* %rdi = from */
 269         jb      0f
 270         cmpq    postbootkernelbase(%rip), %rsi          /* %rsi = to */
 271         jnb     1f
 272 0:      leaq    .bcopy_panic_msg(%rip), %rdi
 273         jmp     call_panic              /* setup stack and call panic */
 274 1:
 275 #endif
 276         /*
 277          * bcopy_altentry() is called from kcopy, i.e., do_copy_fault.
 278          * kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy
 279          * uses these registers in future they must be saved and restored.
 280          */
 281         ALTENTRY(bcopy_altentry)
 282 do_copy:
 283 #define L(s) .bcopy/**/s
 284         cmpq    $0x50, %rdx             /* 80 */
 285         jae     bcopy_ck_size
 286 
 287         /*
 288          * Performance data shows many callers copy small buffers. For the best
 289          * performance at these sizes, unrolled code is used. Store data without
 290          * worrying about alignment.
 291          */
 292         leaq    L(fwdPxQx)(%rip), %r10
 293         addq    %rdx, %rdi
 294         addq    %rdx, %rsi
 295         movslq  (%r10,%rdx,4), %rcx
 296         leaq    (%rcx,%r10,1), %r10
 297         INDIRECT_JMP_REG(r10)
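             /*
              * The table below holds 32-bit offsets from L(fwdPxQx) to the
              * handler for each byte count 0..79; the movslq/leaq pair above
              * turns the count in %rdx into the handler's absolute address.
              * Handlers are named L(PxQy) where the count is 8 * y + x: each
              * one falls through y quadword copies and ends with an x-byte
              * tail, all addressed backwards from the already-advanced
              * %rdi/%rsi.
              */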
 298 
 299         .p2align 4
 300 L(fwdPxQx):
 301         .int       L(P0Q0)-L(fwdPxQx)   /* 0 */
 302         .int       L(P1Q0)-L(fwdPxQx)
 303         .int       L(P2Q0)-L(fwdPxQx)
 304         .int       L(P3Q0)-L(fwdPxQx)
 305         .int       L(P4Q0)-L(fwdPxQx)
 306         .int       L(P5Q0)-L(fwdPxQx)
 307         .int       L(P6Q0)-L(fwdPxQx)
 308         .int       L(P7Q0)-L(fwdPxQx)
 309 
 310         .int       L(P0Q1)-L(fwdPxQx)   /* 8 */
 311         .int       L(P1Q1)-L(fwdPxQx)
 312         .int       L(P2Q1)-L(fwdPxQx)
 313         .int       L(P3Q1)-L(fwdPxQx)
 314         .int       L(P4Q1)-L(fwdPxQx)
 315         .int       L(P5Q1)-L(fwdPxQx)
 316         .int       L(P6Q1)-L(fwdPxQx)
 317         .int       L(P7Q1)-L(fwdPxQx)
 318 
 319         .int       L(P0Q2)-L(fwdPxQx)   /* 16 */
 320         .int       L(P1Q2)-L(fwdPxQx)
 321         .int       L(P2Q2)-L(fwdPxQx)
 322         .int       L(P3Q2)-L(fwdPxQx)
 323         .int       L(P4Q2)-L(fwdPxQx)
 324         .int       L(P5Q2)-L(fwdPxQx)
 325         .int       L(P6Q2)-L(fwdPxQx)
 326         .int       L(P7Q2)-L(fwdPxQx)
 327 
 328         .int       L(P0Q3)-L(fwdPxQx)   /* 24 */
 329         .int       L(P1Q3)-L(fwdPxQx)
 330         .int       L(P2Q3)-L(fwdPxQx)
 331         .int       L(P3Q3)-L(fwdPxQx)
 332         .int       L(P4Q3)-L(fwdPxQx)
 333         .int       L(P5Q3)-L(fwdPxQx)
 334         .int       L(P6Q3)-L(fwdPxQx)
 335         .int       L(P7Q3)-L(fwdPxQx)
 336 
 337         .int       L(P0Q4)-L(fwdPxQx)   /* 32 */
 338         .int       L(P1Q4)-L(fwdPxQx)
 339         .int       L(P2Q4)-L(fwdPxQx)
 340         .int       L(P3Q4)-L(fwdPxQx)
 341         .int       L(P4Q4)-L(fwdPxQx)
 342         .int       L(P5Q4)-L(fwdPxQx)
 343         .int       L(P6Q4)-L(fwdPxQx)
 344         .int       L(P7Q4)-L(fwdPxQx)
 345 
 346         .int       L(P0Q5)-L(fwdPxQx)   /* 40 */
 347         .int       L(P1Q5)-L(fwdPxQx)
 348         .int       L(P2Q5)-L(fwdPxQx)
 349         .int       L(P3Q5)-L(fwdPxQx)
 350         .int       L(P4Q5)-L(fwdPxQx)
 351         .int       L(P5Q5)-L(fwdPxQx)
 352         .int       L(P6Q5)-L(fwdPxQx)
 353         .int       L(P7Q5)-L(fwdPxQx)
 354 
 355         .int       L(P0Q6)-L(fwdPxQx)   /* 48 */
 356         .int       L(P1Q6)-L(fwdPxQx)
 357         .int       L(P2Q6)-L(fwdPxQx)
 358         .int       L(P3Q6)-L(fwdPxQx)
 359         .int       L(P4Q6)-L(fwdPxQx)
 360         .int       L(P5Q6)-L(fwdPxQx)
 361         .int       L(P6Q6)-L(fwdPxQx)
 362         .int       L(P7Q6)-L(fwdPxQx)
 363 
 364         .int       L(P0Q7)-L(fwdPxQx)   /* 56 */
 365         .int       L(P1Q7)-L(fwdPxQx)
 366         .int       L(P2Q7)-L(fwdPxQx)
 367         .int       L(P3Q7)-L(fwdPxQx)
 368         .int       L(P4Q7)-L(fwdPxQx)
 369         .int       L(P5Q7)-L(fwdPxQx)
 370         .int       L(P6Q7)-L(fwdPxQx)
 371         .int       L(P7Q7)-L(fwdPxQx)
 372 
 373         .int       L(P0Q8)-L(fwdPxQx)   /* 64 */
 374         .int       L(P1Q8)-L(fwdPxQx)
 375         .int       L(P2Q8)-L(fwdPxQx)
 376         .int       L(P3Q8)-L(fwdPxQx)
 377         .int       L(P4Q8)-L(fwdPxQx)
 378         .int       L(P5Q8)-L(fwdPxQx)
 379         .int       L(P6Q8)-L(fwdPxQx)
 380         .int       L(P7Q8)-L(fwdPxQx)
 381 
 382         .int       L(P0Q9)-L(fwdPxQx)   /* 72 */
 383         .int       L(P1Q9)-L(fwdPxQx)
 384         .int       L(P2Q9)-L(fwdPxQx)
 385         .int       L(P3Q9)-L(fwdPxQx)
 386         .int       L(P4Q9)-L(fwdPxQx)
 387         .int       L(P5Q9)-L(fwdPxQx)
 388         .int       L(P6Q9)-L(fwdPxQx)
 389         .int       L(P7Q9)-L(fwdPxQx)   /* 79 */
 390 
 391         .p2align 4
 392 L(P0Q9):
 393         mov    -0x48(%rdi), %rcx
 394         mov    %rcx, -0x48(%rsi)
 395 L(P0Q8):
 396         mov    -0x40(%rdi), %r10
 397         mov    %r10, -0x40(%rsi)
 398 L(P0Q7):
 399         mov    -0x38(%rdi), %r8
 400         mov    %r8, -0x38(%rsi)
 401 L(P0Q6):
 402         mov    -0x30(%rdi), %rcx
 403         mov    %rcx, -0x30(%rsi)
 404 L(P0Q5):
 405         mov    -0x28(%rdi), %r10
 406         mov    %r10, -0x28(%rsi)
 407 L(P0Q4):
 408         mov    -0x20(%rdi), %r8
 409         mov    %r8, -0x20(%rsi)
 410 L(P0Q3):
 411         mov    -0x18(%rdi), %rcx
 412         mov    %rcx, -0x18(%rsi)
 413 L(P0Q2):
 414         mov    -0x10(%rdi), %r10
 415         mov    %r10, -0x10(%rsi)
 416 L(P0Q1):
 417         mov    -0x8(%rdi), %r8
 418         mov    %r8, -0x8(%rsi)
 419 L(P0Q0):
 420         ret
 421 
 422         .p2align 4
 423 L(P1Q9):
 424         mov    -0x49(%rdi), %r8
 425         mov    %r8, -0x49(%rsi)
 426 L(P1Q8):
 427         mov    -0x41(%rdi), %rcx
 428         mov    %rcx, -0x41(%rsi)
 429 L(P1Q7):
 430         mov    -0x39(%rdi), %r10
 431         mov    %r10, -0x39(%rsi)
 432 L(P1Q6):
 433         mov    -0x31(%rdi), %r8
 434         mov    %r8, -0x31(%rsi)
 435 L(P1Q5):
 436         mov    -0x29(%rdi), %rcx
 437         mov    %rcx, -0x29(%rsi)
 438 L(P1Q4):
 439         mov    -0x21(%rdi), %r10
 440         mov    %r10, -0x21(%rsi)
 441 L(P1Q3):
 442         mov    -0x19(%rdi), %r8
 443         mov    %r8, -0x19(%rsi)
 444 L(P1Q2):
 445         mov    -0x11(%rdi), %rcx
 446         mov    %rcx, -0x11(%rsi)
 447 L(P1Q1):
 448         mov    -0x9(%rdi), %r10
 449         mov    %r10, -0x9(%rsi)
 450 L(P1Q0):
 451         movzbq -0x1(%rdi), %r8
 452         mov    %r8b, -0x1(%rsi)
 453         ret
 454 
 455         .p2align 4
 456 L(P2Q9):
 457         mov    -0x4a(%rdi), %r8
 458         mov    %r8, -0x4a(%rsi)
 459 L(P2Q8):
 460         mov    -0x42(%rdi), %rcx
 461         mov    %rcx, -0x42(%rsi)
 462 L(P2Q7):
 463         mov    -0x3a(%rdi), %r10
 464         mov    %r10, -0x3a(%rsi)
 465 L(P2Q6):
 466         mov    -0x32(%rdi), %r8
 467         mov    %r8, -0x32(%rsi)
 468 L(P2Q5):
 469         mov    -0x2a(%rdi), %rcx
 470         mov    %rcx, -0x2a(%rsi)
 471 L(P2Q4):
 472         mov    -0x22(%rdi), %r10
 473         mov    %r10, -0x22(%rsi)
 474 L(P2Q3):
 475         mov    -0x1a(%rdi), %r8
 476         mov    %r8, -0x1a(%rsi)
 477 L(P2Q2):
 478         mov    -0x12(%rdi), %rcx
 479         mov    %rcx, -0x12(%rsi)
 480 L(P2Q1):
 481         mov    -0xa(%rdi), %r10
 482         mov    %r10, -0xa(%rsi)
 483 L(P2Q0):
 484         movzwq -0x2(%rdi), %r8
 485         mov    %r8w, -0x2(%rsi)
 486         ret
 487 
 488         .p2align 4
 489 L(P3Q9):
 490         mov    -0x4b(%rdi), %r8
 491         mov    %r8, -0x4b(%rsi)
 492 L(P3Q8):
 493         mov    -0x43(%rdi), %rcx
 494         mov    %rcx, -0x43(%rsi)
 495 L(P3Q7):
 496         mov    -0x3b(%rdi), %r10
 497         mov    %r10, -0x3b(%rsi)
 498 L(P3Q6):
 499         mov    -0x33(%rdi), %r8
 500         mov    %r8, -0x33(%rsi)
 501 L(P3Q5):
 502         mov    -0x2b(%rdi), %rcx
 503         mov    %rcx, -0x2b(%rsi)
 504 L(P3Q4):
 505         mov    -0x23(%rdi), %r10
 506         mov    %r10, -0x23(%rsi)
 507 L(P3Q3):
 508         mov    -0x1b(%rdi), %r8
 509         mov    %r8, -0x1b(%rsi)
 510 L(P3Q2):
 511         mov    -0x13(%rdi), %rcx
 512         mov    %rcx, -0x13(%rsi)
 513 L(P3Q1):
 514         mov    -0xb(%rdi), %r10
 515         mov    %r10, -0xb(%rsi)
 516         /*
 517          * These trailing loads/stores have to do all their loads 1st,
 518          * then do the stores.
 519          */
 520 L(P3Q0):
 521         movzwq -0x3(%rdi), %r8
 522         movzbq -0x1(%rdi), %r10
 523         mov    %r8w, -0x3(%rsi)
 524         mov    %r10b, -0x1(%rsi)
 525         ret
 526 
 527         .p2align 4
 528 L(P4Q9):
 529         mov    -0x4c(%rdi), %r8
 530         mov    %r8, -0x4c(%rsi)
 531 L(P4Q8):
 532         mov    -0x44(%rdi), %rcx
 533         mov    %rcx, -0x44(%rsi)
 534 L(P4Q7):
 535         mov    -0x3c(%rdi), %r10
 536         mov    %r10, -0x3c(%rsi)
 537 L(P4Q6):
 538         mov    -0x34(%rdi), %r8
 539         mov    %r8, -0x34(%rsi)
 540 L(P4Q5):
 541         mov    -0x2c(%rdi), %rcx
 542         mov    %rcx, -0x2c(%rsi)
 543 L(P4Q4):
 544         mov    -0x24(%rdi), %r10
 545         mov    %r10, -0x24(%rsi)
 546 L(P4Q3):
 547         mov    -0x1c(%rdi), %r8
 548         mov    %r8, -0x1c(%rsi)
 549 L(P4Q2):
 550         mov    -0x14(%rdi), %rcx
 551         mov    %rcx, -0x14(%rsi)
 552 L(P4Q1):
 553         mov    -0xc(%rdi), %r10
 554         mov    %r10, -0xc(%rsi)
 555 L(P4Q0):
 556         mov    -0x4(%rdi), %r8d
 557         mov    %r8d, -0x4(%rsi)
 558         ret
 559 
 560         .p2align 4
 561 L(P5Q9):
 562         mov    -0x4d(%rdi), %r8
 563         mov    %r8, -0x4d(%rsi)
 564 L(P5Q8):
 565         mov    -0x45(%rdi), %rcx
 566         mov    %rcx, -0x45(%rsi)
 567 L(P5Q7):
 568         mov    -0x3d(%rdi), %r10
 569         mov    %r10, -0x3d(%rsi)
 570 L(P5Q6):
 571         mov    -0x35(%rdi), %r8
 572         mov    %r8, -0x35(%rsi)
 573 L(P5Q5):
 574         mov    -0x2d(%rdi), %rcx
 575         mov    %rcx, -0x2d(%rsi)
 576 L(P5Q4):
 577         mov    -0x25(%rdi), %r10
 578         mov    %r10, -0x25(%rsi)
 579 L(P5Q3):
 580         mov    -0x1d(%rdi), %r8
 581         mov    %r8, -0x1d(%rsi)
 582 L(P5Q2):
 583         mov    -0x15(%rdi), %rcx
 584         mov    %rcx, -0x15(%rsi)
 585 L(P5Q1):
 586         mov    -0xd(%rdi), %r10
 587         mov    %r10, -0xd(%rsi)
 588 L(P5Q0):
 589         mov    -0x5(%rdi), %r8d
 590         movzbq -0x1(%rdi), %r10
 591         mov    %r8d, -0x5(%rsi)
 592         mov    %r10b, -0x1(%rsi)
 593         ret
 594 
 595         .p2align 4
 596 L(P6Q9):
 597         mov    -0x4e(%rdi), %r8
 598         mov    %r8, -0x4e(%rsi)
 599 L(P6Q8):
 600         mov    -0x46(%rdi), %rcx
 601         mov    %rcx, -0x46(%rsi)
 602 L(P6Q7):
 603         mov    -0x3e(%rdi), %r10
 604         mov    %r10, -0x3e(%rsi)
 605 L(P6Q6):
 606         mov    -0x36(%rdi), %r8
 607         mov    %r8, -0x36(%rsi)
 608 L(P6Q5):
 609         mov    -0x2e(%rdi), %rcx
 610         mov    %rcx, -0x2e(%rsi)
 611 L(P6Q4):
 612         mov    -0x26(%rdi), %r10
 613         mov    %r10, -0x26(%rsi)
 614 L(P6Q3):
 615         mov    -0x1e(%rdi), %r8
 616         mov    %r8, -0x1e(%rsi)
 617 L(P6Q2):
 618         mov    -0x16(%rdi), %rcx
 619         mov    %rcx, -0x16(%rsi)
 620 L(P6Q1):
 621         mov    -0xe(%rdi), %r10
 622         mov    %r10, -0xe(%rsi)
 623 L(P6Q0):
 624         mov    -0x6(%rdi), %r8d
 625         movzwq -0x2(%rdi), %r10
 626         mov    %r8d, -0x6(%rsi)
 627         mov    %r10w, -0x2(%rsi)
 628         ret
 629 
 630         .p2align 4
 631 L(P7Q9):
 632         mov    -0x4f(%rdi), %r8
 633         mov    %r8, -0x4f(%rsi)
 634 L(P7Q8):
 635         mov    -0x47(%rdi), %rcx
 636         mov    %rcx, -0x47(%rsi)
 637 L(P7Q7):
 638         mov    -0x3f(%rdi), %r10
 639         mov    %r10, -0x3f(%rsi)
 640 L(P7Q6):
 641         mov    -0x37(%rdi), %r8
 642         mov    %r8, -0x37(%rsi)
 643 L(P7Q5):
 644         mov    -0x2f(%rdi), %rcx
 645         mov    %rcx, -0x2f(%rsi)
 646 L(P7Q4):
 647         mov    -0x27(%rdi), %r10
 648         mov    %r10, -0x27(%rsi)
 649 L(P7Q3):
 650         mov    -0x1f(%rdi), %r8
 651         mov    %r8, -0x1f(%rsi)
 652 L(P7Q2):
 653         mov    -0x17(%rdi), %rcx
 654         mov    %rcx, -0x17(%rsi)
 655 L(P7Q1):
 656         mov    -0xf(%rdi), %r10
 657         mov    %r10, -0xf(%rsi)
 658 L(P7Q0):
 659         mov    -0x7(%rdi), %r8d
 660         movzwq -0x3(%rdi), %r10
 661         movzbq -0x1(%rdi), %rcx
 662         mov    %r8d, -0x7(%rsi)
 663         mov    %r10w, -0x3(%rsi)
 664         mov    %cl, -0x1(%rsi)
 665         ret
 666 
 667         /*
 668          * For large sizes rep smovq is fastest.
 669          * Transition point determined experimentally as measured on
 670          * Intel Xeon processors (incl. Nehalem and previous generations) and
 671          * AMD Opteron. The transition value is patched into the code at boot
 672          * time to avoid the cost of a run-time memory reference.
 673          */
 674         .globl bcopy_patch_start
 675 bcopy_patch_start:
 676         cmpq    $BCOPY_NHM_REP, %rdx
 677         .globl bcopy_patch_end
 678 bcopy_patch_end:
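             /*
              * The cmpq bracketed by bcopy_patch_start/bcopy_patch_end appears
              * to be a patch template (hedged description): on processors
              * where the larger BCOPY_NHM_REP threshold performs better, boot
              * code can copy these instruction bytes over the default
              * cmpq $BCOPY_DFLT_REP in bcopy_ck_size below, raising the
              * "rep smovq" cut-over without a per-call memory reference.
              */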
 679 
 680         .p2align 4
 681         ALTENTRY(bcopy_ck_size)
 682 
 683         cmpq    $BCOPY_DFLT_REP, %rdx
 684         jae     L(use_rep)
 685 
 686         /*
 687          * Align to an 8-byte boundary. Avoids penalties from unaligned stores
 688          * as well as from stores spanning cachelines.
 689          */
 690         test    $0x7, %rsi
 691         jz      L(aligned_loop)
 692         test    $0x1, %rsi
 693         jz      2f
 694         movzbq  (%rdi), %r8
 695         dec     %rdx
 696         inc     %rdi
 697         mov     %r8b, (%rsi)
 698         inc     %rsi
 699 2:
 700         test    $0x2, %rsi
 701         jz      4f
 702         movzwq  (%rdi), %r8
 703         sub     $0x2, %rdx
 704         add     $0x2, %rdi
 705         mov     %r8w, (%rsi)
 706         add     $0x2, %rsi
 707 4:
 708         test    $0x4, %rsi
 709         jz      L(aligned_loop)
 710         mov     (%rdi), %r8d
 711         sub     $0x4, %rdx
 712         add     $0x4, %rdi
 713         mov     %r8d, (%rsi)
 714         add     $0x4, %rsi
 715 
 716         /*
 717          * Copy 64-bytes per loop
 718          */
 719         .p2align 4
 720 L(aligned_loop):
 721         mov     (%rdi), %r8
 722         mov     0x8(%rdi), %r10
 723         lea     -0x40(%rdx), %rdx
 724         mov     %r8, (%rsi)
 725         mov     %r10, 0x8(%rsi)
 726         mov     0x10(%rdi), %rcx
 727         mov     0x18(%rdi), %r8
 728         mov     %rcx, 0x10(%rsi)
 729         mov     %r8, 0x18(%rsi)
 730 
 731         cmp     $0x40, %rdx
 732         mov     0x20(%rdi), %r10
 733         mov     0x28(%rdi), %rcx
 734         mov     %r10, 0x20(%rsi)
 735         mov     %rcx, 0x28(%rsi)
 736         mov     0x30(%rdi), %r8
 737         mov     0x38(%rdi), %r10
 738         lea     0x40(%rdi), %rdi
 739         mov     %r8, 0x30(%rsi)
 740         mov     %r10, 0x38(%rsi)
 741         lea     0x40(%rsi), %rsi
 742         jae     L(aligned_loop)
 743 
 744         /*
 745          * Copy remaining bytes (0-63)
 746          */
 747 L(do_remainder):
 748         leaq    L(fwdPxQx)(%rip), %r10
 749         addq    %rdx, %rdi
 750         addq    %rdx, %rsi
 751         movslq  (%r10,%rdx,4), %rcx
 752         leaq    (%rcx,%r10,1), %r10
 753         INDIRECT_JMP_REG(r10)
 754 
 755         /*
 756          * Use rep smovq. Copy any remainder via unrolled code
 757          */
 758         .p2align 4
 759 L(use_rep):
 760         xchgq   %rdi, %rsi              /* %rsi = source, %rdi = destination */
 761         movq    %rdx, %rcx              /* %rcx = count */
 762         shrq    $3, %rcx                /* 8-byte word count */
 763         rep
 764           smovq
 765 
 766         xchgq   %rsi, %rdi              /* %rdi = src, %rsi = destination */
 767         andq    $7, %rdx                /* remainder */
 768         jnz     L(do_remainder)
 769         ret
 770 #undef  L
 771         SET_SIZE(bcopy_ck_size)
 772 
 773 #ifdef DEBUG
 774         /*
 775          * Set up a frame on the run-time stack. The end of the input argument
 776          * area must be aligned on a 16-byte boundary. The stack pointer %rsp
 777          * always points to the end of the latest allocated stack frame.
 778          * panic(const char *format, ...) is a varargs function. When a
 779          * function taking variable arguments is called, %al must be set to
 780          * (an upper bound on) the number of floating point parameters passed
 781          * to the function in SSE registers.
 782          */
 783 call_panic:
 784         pushq   %rbp                    /* align stack properly */
 785         movq    %rsp, %rbp
 786         xorl    %eax, %eax              /* no variable arguments */
 787         call    panic                   /* %rdi = format string */
 788 #endif
 789         SET_SIZE(bcopy_altentry)
 790         SET_SIZE(bcopy)
 791 
 792 
 793 /*
 794  * Zero a block of storage, returning an error code if we
 795  * take a kernel pagefault which cannot be resolved.
 796  * Returns errno value on pagefault error, 0 if all ok
 797  */
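     /*
      * Roughly, the C-level contracts (a hedged sketch; the kernel's own
      * header declarations are authoritative):
      *
      *	int kzero(void *addr, size_t count);	returns 0 or the fault errno
      *	void bzero(void *addr, size_t count);
      */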
 798 
 799         ENTRY(kzero)
 800 #ifdef DEBUG
 801         cmpq    postbootkernelbase(%rip), %rdi  /* %rdi = addr */
 802         jnb     0f
 803         leaq    .kzero_panic_msg(%rip), %rdi
 804         jmp     call_panic              /* setup stack and call panic */
 805 0:
 806 #endif
 807         /*
 808          * pass lofault value as 3rd argument for fault return
 809          */
 810         leaq    _kzeroerr(%rip), %rdx
 811 
 812         movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
 813         movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
 814         movq    %rdx, T_LOFAULT(%r9)    /* new lofault */
 815         call    bzero_altentry
 816         xorl    %eax, %eax
 817         movq    %r11, T_LOFAULT(%r9)    /* restore the original lofault */
 818         ret
 819         /*
 820          * A fault during bzero is indicated through an errno value
 821          * in %rax when we iretq to here.
 822          */
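             /*
              * The fault was taken somewhere inside the call to
              * bzero_altentry, so the return address pushed by that call is
              * still on the stack when we arrive here; it must be discarded
              * before returning to kzero's caller.
              */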
 823 _kzeroerr:
 824         addq    $8, %rsp                /* pop bzero_altentry call ret addr */
 825         movq    %r11, T_LOFAULT(%r9)    /* restore the original lofault */
 826         ret
 827         SET_SIZE(kzero)
 828 
 829 /*
 830  * Zero a block of storage.
 831  */
 832 
 833         ENTRY(bzero)
 834 #ifdef DEBUG
 835         cmpq    postbootkernelbase(%rip), %rdi  /* %rdi = addr */
 836         jnb     0f
 837         leaq    .bzero_panic_msg(%rip), %rdi
 838         jmp     call_panic              /* setup stack and call panic */
 839 0:
 840 #endif
 841         ALTENTRY(bzero_altentry)
 842 do_zero:
 843 #define L(s) .bzero/**/s
 844         xorl    %eax, %eax
 845 
 846         cmpq    $0x50, %rsi             /* 80 */
 847         jae     L(ck_align)
 848 
 849         /*
 850          * Performance data shows many callers are zeroing small buffers. For
 851          * the best performance at these sizes, unrolled code is used. Store zeros
 852          * without worrying about alignment.
 853          */
 854         leaq    L(setPxQx)(%rip), %r10
 855         addq    %rsi, %rdi
 856         movslq  (%r10,%rsi,4), %rcx
 857         leaq    (%rcx,%r10,1), %r10
 858         INDIRECT_JMP_REG(r10)
 859 
 860         .p2align 4
 861 L(setPxQx):
 862         .int       L(P0Q0)-L(setPxQx)   /* 0 */
 863         .int       L(P1Q0)-L(setPxQx)
 864         .int       L(P2Q0)-L(setPxQx)
 865         .int       L(P3Q0)-L(setPxQx)
 866         .int       L(P4Q0)-L(setPxQx)
 867         .int       L(P5Q0)-L(setPxQx)
 868         .int       L(P6Q0)-L(setPxQx)
 869         .int       L(P7Q0)-L(setPxQx)
 870 
 871         .int       L(P0Q1)-L(setPxQx)   /* 8 */
 872         .int       L(P1Q1)-L(setPxQx)
 873         .int       L(P2Q1)-L(setPxQx)
 874         .int       L(P3Q1)-L(setPxQx)
 875         .int       L(P4Q1)-L(setPxQx)
 876         .int       L(P5Q1)-L(setPxQx)
 877         .int       L(P6Q1)-L(setPxQx)
 878         .int       L(P7Q1)-L(setPxQx)
 879 
 880         .int       L(P0Q2)-L(setPxQx)   /* 16 */
 881         .int       L(P1Q2)-L(setPxQx)
 882         .int       L(P2Q2)-L(setPxQx)
 883         .int       L(P3Q2)-L(setPxQx)
 884         .int       L(P4Q2)-L(setPxQx)
 885         .int       L(P5Q2)-L(setPxQx)
 886         .int       L(P6Q2)-L(setPxQx)
 887         .int       L(P7Q2)-L(setPxQx)
 888 
 889         .int       L(P0Q3)-L(setPxQx)   /* 24 */
 890         .int       L(P1Q3)-L(setPxQx)
 891         .int       L(P2Q3)-L(setPxQx)
 892         .int       L(P3Q3)-L(setPxQx)
 893         .int       L(P4Q3)-L(setPxQx)
 894         .int       L(P5Q3)-L(setPxQx)
 895         .int       L(P6Q3)-L(setPxQx)
 896         .int       L(P7Q3)-L(setPxQx)
 897 
 898         .int       L(P0Q4)-L(setPxQx)   /* 32 */
 899         .int       L(P1Q4)-L(setPxQx)
 900         .int       L(P2Q4)-L(setPxQx)
 901         .int       L(P3Q4)-L(setPxQx)
 902         .int       L(P4Q4)-L(setPxQx)
 903         .int       L(P5Q4)-L(setPxQx)
 904         .int       L(P6Q4)-L(setPxQx)
 905         .int       L(P7Q4)-L(setPxQx)
 906 
 907         .int       L(P0Q5)-L(setPxQx)   /* 40 */
 908         .int       L(P1Q5)-L(setPxQx)
 909         .int       L(P2Q5)-L(setPxQx)
 910         .int       L(P3Q5)-L(setPxQx)
 911         .int       L(P4Q5)-L(setPxQx)
 912         .int       L(P5Q5)-L(setPxQx)
 913         .int       L(P6Q5)-L(setPxQx)
 914         .int       L(P7Q5)-L(setPxQx)
 915 
 916         .int       L(P0Q6)-L(setPxQx)   /* 48 */
 917         .int       L(P1Q6)-L(setPxQx)
 918         .int       L(P2Q6)-L(setPxQx)
 919         .int       L(P3Q6)-L(setPxQx)
 920         .int       L(P4Q6)-L(setPxQx)
 921         .int       L(P5Q6)-L(setPxQx)
 922         .int       L(P6Q6)-L(setPxQx)
 923         .int       L(P7Q6)-L(setPxQx)
 924 
 925         .int       L(P0Q7)-L(setPxQx)   /* 56 */
 926         .int       L(P1Q7)-L(setPxQx)
 927         .int       L(P2Q7)-L(setPxQx)
 928         .int       L(P3Q7)-L(setPxQx)
 929         .int       L(P4Q7)-L(setPxQx)
 930         .int       L(P5Q7)-L(setPxQx)
 931         .int       L(P6Q7)-L(setPxQx)
 932         .int       L(P7Q7)-L(setPxQx)
 933 
 934         .int       L(P0Q8)-L(setPxQx)   /* 64 */
 935         .int       L(P1Q8)-L(setPxQx)
 936         .int       L(P2Q8)-L(setPxQx)
 937         .int       L(P3Q8)-L(setPxQx)
 938         .int       L(P4Q8)-L(setPxQx)
 939         .int       L(P5Q8)-L(setPxQx)
 940         .int       L(P6Q8)-L(setPxQx)
 941         .int       L(P7Q8)-L(setPxQx)
 942 
 943         .int       L(P0Q9)-L(setPxQx)   /* 72 */
 944         .int       L(P1Q9)-L(setPxQx)
 945         .int       L(P2Q9)-L(setPxQx)
 946         .int       L(P3Q9)-L(setPxQx)
 947         .int       L(P4Q9)-L(setPxQx)
 948         .int       L(P5Q9)-L(setPxQx)
 949         .int       L(P6Q9)-L(setPxQx)
 950         .int       L(P7Q9)-L(setPxQx)   /* 79 */
 951 
 952         .p2align 4
 953 L(P0Q9): mov    %rax, -0x48(%rdi)
 954 L(P0Q8): mov    %rax, -0x40(%rdi)
 955 L(P0Q7): mov    %rax, -0x38(%rdi)
 956 L(P0Q6): mov    %rax, -0x30(%rdi)
 957 L(P0Q5): mov    %rax, -0x28(%rdi)
 958 L(P0Q4): mov    %rax, -0x20(%rdi)
 959 L(P0Q3): mov    %rax, -0x18(%rdi)
 960 L(P0Q2): mov    %rax, -0x10(%rdi)
 961 L(P0Q1): mov    %rax, -0x8(%rdi)
 962 L(P0Q0):
 963          ret
 964 
 965         .p2align 4
 966 L(P1Q9): mov    %rax, -0x49(%rdi)
 967 L(P1Q8): mov    %rax, -0x41(%rdi)
 968 L(P1Q7): mov    %rax, -0x39(%rdi)
 969 L(P1Q6): mov    %rax, -0x31(%rdi)
 970 L(P1Q5): mov    %rax, -0x29(%rdi)
 971 L(P1Q4): mov    %rax, -0x21(%rdi)
 972 L(P1Q3): mov    %rax, -0x19(%rdi)
 973 L(P1Q2): mov    %rax, -0x11(%rdi)
 974 L(P1Q1): mov    %rax, -0x9(%rdi)
 975 L(P1Q0): mov    %al, -0x1(%rdi)
 976          ret
 977 
 978         .p2align 4
 979 L(P2Q9): mov    %rax, -0x4a(%rdi)
 980 L(P2Q8): mov    %rax, -0x42(%rdi)
 981 L(P2Q7): mov    %rax, -0x3a(%rdi)
 982 L(P2Q6): mov    %rax, -0x32(%rdi)
 983 L(P2Q5): mov    %rax, -0x2a(%rdi)
 984 L(P2Q4): mov    %rax, -0x22(%rdi)
 985 L(P2Q3): mov    %rax, -0x1a(%rdi)
 986 L(P2Q2): mov    %rax, -0x12(%rdi)
 987 L(P2Q1): mov    %rax, -0xa(%rdi)
 988 L(P2Q0): mov    %ax, -0x2(%rdi)
 989          ret
 990 
 991         .p2align 4
 992 L(P3Q9): mov    %rax, -0x4b(%rdi)
 993 L(P3Q8): mov    %rax, -0x43(%rdi)
 994 L(P3Q7): mov    %rax, -0x3b(%rdi)
 995 L(P3Q6): mov    %rax, -0x33(%rdi)
 996 L(P3Q5): mov    %rax, -0x2b(%rdi)
 997 L(P3Q4): mov    %rax, -0x23(%rdi)
 998 L(P3Q3): mov    %rax, -0x1b(%rdi)
 999 L(P3Q2): mov    %rax, -0x13(%rdi)
1000 L(P3Q1): mov    %rax, -0xb(%rdi)
1001 L(P3Q0): mov    %ax, -0x3(%rdi)
1002          mov    %al, -0x1(%rdi)
1003          ret
1004 
1005         .p2align 4
1006 L(P4Q9): mov    %rax, -0x4c(%rdi)
1007 L(P4Q8): mov    %rax, -0x44(%rdi)
1008 L(P4Q7): mov    %rax, -0x3c(%rdi)
1009 L(P4Q6): mov    %rax, -0x34(%rdi)
1010 L(P4Q5): mov    %rax, -0x2c(%rdi)
1011 L(P4Q4): mov    %rax, -0x24(%rdi)
1012 L(P4Q3): mov    %rax, -0x1c(%rdi)
1013 L(P4Q2): mov    %rax, -0x14(%rdi)
1014 L(P4Q1): mov    %rax, -0xc(%rdi)
1015 L(P4Q0): mov    %eax, -0x4(%rdi)
1016          ret
1017 
1018         .p2align 4
1019 L(P5Q9): mov    %rax, -0x4d(%rdi)
1020 L(P5Q8): mov    %rax, -0x45(%rdi)
1021 L(P5Q7): mov    %rax, -0x3d(%rdi)
1022 L(P5Q6): mov    %rax, -0x35(%rdi)
1023 L(P5Q5): mov    %rax, -0x2d(%rdi)
1024 L(P5Q4): mov    %rax, -0x25(%rdi)
1025 L(P5Q3): mov    %rax, -0x1d(%rdi)
1026 L(P5Q2): mov    %rax, -0x15(%rdi)
1027 L(P5Q1): mov    %rax, -0xd(%rdi)
1028 L(P5Q0): mov    %eax, -0x5(%rdi)
1029          mov    %al, -0x1(%rdi)
1030          ret
1031 
1032         .p2align 4
1033 L(P6Q9): mov    %rax, -0x4e(%rdi)
1034 L(P6Q8): mov    %rax, -0x46(%rdi)
1035 L(P6Q7): mov    %rax, -0x3e(%rdi)
1036 L(P6Q6): mov    %rax, -0x36(%rdi)
1037 L(P6Q5): mov    %rax, -0x2e(%rdi)
1038 L(P6Q4): mov    %rax, -0x26(%rdi)
1039 L(P6Q3): mov    %rax, -0x1e(%rdi)
1040 L(P6Q2): mov    %rax, -0x16(%rdi)
1041 L(P6Q1): mov    %rax, -0xe(%rdi)
1042 L(P6Q0): mov    %eax, -0x6(%rdi)
1043          mov    %ax, -0x2(%rdi)
1044          ret
1045 
1046         .p2align 4
1047 L(P7Q9): mov    %rax, -0x4f(%rdi)
1048 L(P7Q8): mov    %rax, -0x47(%rdi)
1049 L(P7Q7): mov    %rax, -0x3f(%rdi)
1050 L(P7Q6): mov    %rax, -0x37(%rdi)
1051 L(P7Q5): mov    %rax, -0x2f(%rdi)
1052 L(P7Q4): mov    %rax, -0x27(%rdi)
1053 L(P7Q3): mov    %rax, -0x1f(%rdi)
1054 L(P7Q2): mov    %rax, -0x17(%rdi)
1055 L(P7Q1): mov    %rax, -0xf(%rdi)
1056 L(P7Q0): mov    %eax, -0x7(%rdi)
1057          mov    %ax, -0x3(%rdi)
1058          mov    %al, -0x1(%rdi)
1059          ret
1060 
1061         /*
1062          * Align to a 16-byte boundary. Avoids penalties from unaligned stores
1063          * as well as from stores spanning cachelines. Note 16-byte alignment
1064          * is better in the case where rep sstoq is used.
1065          */
1066         .p2align 4
1067 L(ck_align):
1068         test    $0xf, %rdi
1069         jz      L(aligned_now)
1070         test    $1, %rdi
1071         jz      2f
1072         mov     %al, (%rdi)
1073         dec     %rsi
1074         lea     1(%rdi),%rdi
1075 2:
1076         test    $2, %rdi
1077         jz      4f
1078         mov     %ax, (%rdi)
1079         sub     $2, %rsi
1080         lea     2(%rdi),%rdi
1081 4:
1082         test    $4, %rdi
1083         jz      8f
1084         mov     %eax, (%rdi)
1085         sub     $4, %rsi
1086         lea     4(%rdi),%rdi
1087 8:
1088         test    $8, %rdi
1089         jz      L(aligned_now)
1090         mov     %rax, (%rdi)
1091         sub     $8, %rsi
1092         lea     8(%rdi),%rdi
1093 
1094         /*
1095          * For large sizes rep sstoq is fastest.
1096          * Transition point determined experimentally as measured on
1097          * Intel Xeon processors (incl. Nehalem) and AMD Opteron.
1098          */
1099 L(aligned_now):
1100         cmp     $BZERO_USE_REP, %rsi
1101         ja      L(use_rep)
1102 
1103         /*
1104          * zero 64-bytes per loop
1105          */
1106         .p2align 4
1107 L(bzero_loop):
1108         leaq    -0x40(%rsi), %rsi
1109         cmpq    $0x40, %rsi
1110         movq    %rax, (%rdi)
1111         movq    %rax, 0x8(%rdi)
1112         movq    %rax, 0x10(%rdi)
1113         movq    %rax, 0x18(%rdi)
1114         movq    %rax, 0x20(%rdi)
1115         movq    %rax, 0x28(%rdi)
1116         movq    %rax, 0x30(%rdi)
1117         movq    %rax, 0x38(%rdi)
1118         leaq    0x40(%rdi), %rdi
1119         jae     L(bzero_loop)
1120 
1121         /*
1122          * Clear any remaining bytes.
1123          */
1124 9:
1125         leaq    L(setPxQx)(%rip), %r10
1126         addq    %rsi, %rdi
1127         movslq  (%r10,%rsi,4), %rcx
1128         leaq    (%rcx,%r10,1), %r10
1129         INDIRECT_JMP_REG(r10)
1130 
1131         /*
1132          * Use rep sstoq. Clear any remainder via unrolled code
1133          */
1134         .p2align 4
1135 L(use_rep):
1136         movq    %rsi, %rcx              /* get size in bytes */
1137         shrq    $3, %rcx                /* count of 8-byte words to zero */
1138         rep
1139           sstoq                         /* %rcx = words to clear (%rax=0) */
1140         andq    $7, %rsi                /* remaining bytes */
1141         jnz     9b
1142         ret
1143 #undef  L
1144         SET_SIZE(bzero_altentry)
1145         SET_SIZE(bzero)
1146 
1147 /*
1148  * Transfer data to and from user space --
1149  * note that these routines can cause faults.
1150  * It is assumed that the kernel has nothing mapped
1151  * below KERNELBASE in the virtual address space.
1152  *
1153  * Note that copyin(9F) and copyout(9F) are part of the
1154  * DDI/DKI which specifies that they return '-1' on "errors."
1155  *
1156  * Sigh.
1157  *
1158  * So there are two extremely similar routines - xcopyin_nta() and
1159  * xcopyout_nta() - which return the errno that we've faithfully computed.
1160  * This allows other callers (e.g. uiomove(9F)) to work correctly.
1161  * Given that these are used pretty heavily, we expand the calling
1162  * sequences inline for all flavours (rather than making wrappers).
1163  */
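     /*
      * For reference, a hedged sketch of the C-level contracts implemented
      * below (parameter names are illustrative; the kernel's own declarations
      * are authoritative):
      *
      *	int copyin(const void *uaddr, void *kaddr, size_t count);
      *	int copyout(const void *kaddr, void *uaddr, size_t count);
      *		both return 0 on success and -1 on any failure, per the DDI.
      *
      *	int xcopyin_nta(const void *uaddr, void *kaddr, size_t count,
      *	    int copy_cached);
      *	int xcopyout_nta(const void *kaddr, void *uaddr, size_t count,
      *	    int copy_cached);
      *		both return 0 on success and an errno on failure; a non-zero
      *		copy_cached forces the regular (cached) copy path even when
      *		the non-temporal size/alignment criteria would be met.
      */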
1164 
1165 /*
1166  * Copy user data to kernel space.
1167  */
1168 
1169         ENTRY(copyin)
1170         pushq   %rbp
1171         movq    %rsp, %rbp
1172         subq    $24, %rsp
1173 
1174         /*
1175          * save args in case we trap and need to rerun as a copyop
1176          */
1177         movq    %rdi, (%rsp)
1178         movq    %rsi, 0x8(%rsp)
1179         movq    %rdx, 0x10(%rsp)
1180 
1181         movq    kernelbase(%rip), %rax
1182 #ifdef DEBUG
1183         cmpq    %rax, %rsi              /* %rsi = kaddr */
1184         jnb     1f
1185         leaq    .copyin_panic_msg(%rip), %rdi
1186         xorl    %eax, %eax
1187         call    panic
1188 1:
1189 #endif
1190         /*
1191          * pass lofault value as 4th argument to do_copy_fault
1192          */
1193         leaq    _copyin_err(%rip), %rcx
1194 
1195         movq    %gs:CPU_THREAD, %r9
1196         cmpq    %rax, %rdi              /* test uaddr < kernelbase */
1197         jae     3f                      /* take copyop if uaddr > kernelbase */
1198         SMAP_DISABLE_INSTR(0)
1199         jmp     do_copy_fault           /* Takes care of leave for us */
1200 
1201 _copyin_err:
1202         SMAP_ENABLE_INSTR(2)
1203         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1204         addq    $8, %rsp                /* pop bcopy_altentry call ret addr */
1205 3:
1206         movq    T_COPYOPS(%r9), %rax
1207         cmpq    $0, %rax
1208         jz      2f
1209         /*
1210          * reload args for the copyop
1211          */
1212         movq    (%rsp), %rdi
1213         movq    0x8(%rsp), %rsi
1214         movq    0x10(%rsp), %rdx
1215         leave
1216         movq    CP_COPYIN(%rax), %rax
1217         INDIRECT_JMP_REG(rax)
1218 
1219 2:      movl    $-1, %eax
1220         leave
1221         ret
1222         SET_SIZE(copyin)
1223 
1224         ENTRY(xcopyin_nta)
1225         pushq   %rbp
1226         movq    %rsp, %rbp
1227         subq    $24, %rsp
1228 
1229         /*
1230          * save args in case we trap and need to rerun as a copyop
1231          * %rcx is consumed in this routine so we don't need to save
1232          * it.
1233          */
1234         movq    %rdi, (%rsp)
1235         movq    %rsi, 0x8(%rsp)
1236         movq    %rdx, 0x10(%rsp)
1237 
1238         movq    kernelbase(%rip), %rax
1239 #ifdef DEBUG
1240         cmpq    %rax, %rsi              /* %rsi = kaddr */
1241         jnb     1f
1242         leaq    .xcopyin_panic_msg(%rip), %rdi
1243         xorl    %eax, %eax
1244         call    panic
1245 1:
1246 #endif
1247         movq    %gs:CPU_THREAD, %r9
1248         cmpq    %rax, %rdi              /* test uaddr < kernelbase */
1249         jae     4f
1250         cmpq    $0, %rcx                /* No non-temporal access? */
1251         /*
1252          * pass lofault value as 4th argument to do_copy_fault
1253          */
1254         leaq    _xcopyin_err(%rip), %rcx        /* doesn't set rflags */
1255         jnz     6f                      /* use regular access */
1256         /*
1257          * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1258          */
1259         cmpq    $XCOPY_MIN_SIZE, %rdx
1260         jae     5f
1261 6:
1262         SMAP_DISABLE_INSTR(1)
1263         jmp     do_copy_fault
1264 
1265         /*
1266          * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1267          * count is COUNT_ALIGN_SIZE aligned.
1268          */
1269 5:
1270         movq    %rdi, %r10
1271         orq     %rsi, %r10
1272         andq    $NTA_ALIGN_MASK, %r10
1273         orq     %rdx, %r10
1274         andq    $COUNT_ALIGN_MASK, %r10
1275         jnz     6b
1276         leaq    _xcopyin_nta_err(%rip), %rcx    /* doesn't set rflags */
1277         SMAP_DISABLE_INSTR(2)
1278         jmp     do_copy_fault_nta       /* use non-temporal access */
1279 
1280 4:
1281         movl    $EFAULT, %eax
1282         jmp     3f
1283 
1284         /*
1285          * A fault during do_copy_fault or do_copy_fault_nta is
1286          * indicated through an errno value in %rax and we iret from the
1287          * trap handler to here.
1288          */
1289 _xcopyin_err:
1290         addq    $8, %rsp                /* pop bcopy_altentry call ret addr */
1291 _xcopyin_nta_err:
1292         SMAP_ENABLE_INSTR(3)
1293         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1294 3:
1295         movq    T_COPYOPS(%r9), %r8
1296         cmpq    $0, %r8
1297         jz      2f
1298 
1299         /*
1300          * reload args for the copyop
1301          */
1302         movq    (%rsp), %rdi
1303         movq    0x8(%rsp), %rsi
1304         movq    0x10(%rsp), %rdx
1305         leave
1306         movq    CP_XCOPYIN(%r8), %r8
1307         INDIRECT_JMP_REG(r8)
1308 
1309 2:      leave
1310         ret
1311         SET_SIZE(xcopyin_nta)
1312 
1313 /*
1314  * Copy kernel data to user space.
1315  */
1316 
1317         ENTRY(copyout)
1318         pushq   %rbp
1319         movq    %rsp, %rbp
1320         subq    $24, %rsp
1321 
1322         /*
1323          * save args in case we trap and need to rerun as a copyop
1324          */
1325         movq    %rdi, (%rsp)
1326         movq    %rsi, 0x8(%rsp)
1327         movq    %rdx, 0x10(%rsp)
1328 
1329         movq    kernelbase(%rip), %rax
1330 #ifdef DEBUG
1331         cmpq    %rax, %rdi              /* %rdi = kaddr */
1332         jnb     1f
1333         leaq    .copyout_panic_msg(%rip), %rdi
1334         xorl    %eax, %eax
1335         call    panic
1336 1:
1337 #endif
1338         /*
1339          * pass lofault value as 4th argument to do_copy_fault
1340          */
1341         leaq    _copyout_err(%rip), %rcx
1342 
1343         movq    %gs:CPU_THREAD, %r9
1344         cmpq    %rax, %rsi              /* test uaddr < kernelbase */
1345         jae     3f                      /* take copyop if uaddr > kernelbase */
1346         SMAP_DISABLE_INSTR(3)
1347         jmp     do_copy_fault           /* Calls leave for us */
1348 
1349 _copyout_err:
1350         SMAP_ENABLE_INSTR(4)
1351         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1352         addq    $8, %rsp                /* pop bcopy_altentry call ret addr */
1353 3:
1354         movq    T_COPYOPS(%r9), %rax
1355         cmpq    $0, %rax
1356         jz      2f
1357 
1358         /*
1359          * reload args for the copyop
1360          */
1361         movq    (%rsp), %rdi
1362         movq    0x8(%rsp), %rsi
1363         movq    0x10(%rsp), %rdx
1364         leave
1365         movq    CP_COPYOUT(%rax), %rax
1366         INDIRECT_JMP_REG(rax)
1367 
1368 2:      movl    $-1, %eax
1369         leave
1370         ret
1371         SET_SIZE(copyout)
1372 
1373         ENTRY(xcopyout_nta)
1374         pushq   %rbp
1375         movq    %rsp, %rbp
1376         subq    $24, %rsp
1377 
1378         /*
1379          * save args in case we trap and need to rerun as a copyop
1380          */
1381         movq    %rdi, (%rsp)
1382         movq    %rsi, 0x8(%rsp)
1383         movq    %rdx, 0x10(%rsp)
1384 
1385         movq    kernelbase(%rip), %rax
1386 #ifdef DEBUG
1387         cmpq    %rax, %rdi              /* %rdi = kaddr */
1388         jnb     1f
1389         leaq    .xcopyout_panic_msg(%rip), %rdi
1390         xorl    %eax, %eax
1391         call    panic
1392 1:
1393 #endif
1394         movq    %gs:CPU_THREAD, %r9
1395         cmpq    %rax, %rsi              /* test uaddr < kernelbase */
1396         jae     4f
1397 
1398         cmpq    $0, %rcx                /* No non-temporal access? */
1399         /*
1400          * pass lofault value as 4th argument to do_copy_fault
1401          */
1402         leaq    _xcopyout_err(%rip), %rcx
1403         jnz     6f
1404         /*
1405          * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1406          */
1407         cmpq    $XCOPY_MIN_SIZE, %rdx
1408         jae     5f
1409 6:
1410         SMAP_DISABLE_INSTR(4)
1411         jmp     do_copy_fault
1412 
1413         /*
1414          * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1415          * count is COUNT_ALIGN_SIZE aligned.
1416          */
1417 5:
1418         movq    %rdi, %r10
1419         orq     %rsi, %r10
1420         andq    $NTA_ALIGN_MASK, %r10
1421         orq     %rdx, %r10
1422         andq    $COUNT_ALIGN_MASK, %r10
1423         jnz     6b
1424         leaq    _xcopyout_nta_err(%rip), %rcx
1425         SMAP_DISABLE_INSTR(5)
1426         call    do_copy_fault_nta
1427         SMAP_ENABLE_INSTR(5)
1428         ret
1429 
1430 4:
1431         movl    $EFAULT, %eax
1432         jmp     3f
1433 
1434         /*
1435          * A fault during do_copy_fault or do_copy_fault_nta is
1436          * indicated through an errno value in %rax and we iret from the
1437          * trap handler to here.
1438          */
1439 _xcopyout_err:
1440         addq    $8, %rsp                /* pop bcopy_altentry call ret addr */
1441 _xcopyout_nta_err:
1442         SMAP_ENABLE_INSTR(6)
1443         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1444 3:
1445         movq    T_COPYOPS(%r9), %r8
1446         cmpq    $0, %r8
1447         jz      2f
1448 
1449         /*
1450          * reload args for the copyop
1451          */
1452         movq    (%rsp), %rdi
1453         movq    0x8(%rsp), %rsi
1454         movq    0x10(%rsp), %rdx
1455         leave
1456         movq    CP_XCOPYOUT(%r8), %r8
1457         INDIRECT_JMP_REG(r8)
1458 
1459 2:      leave
1460         ret
1461         SET_SIZE(xcopyout_nta)
1462 
1463 /*
1464  * Copy a null terminated string from one point to another in
1465  * the kernel address space.
1466  */
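     /*
      * Roughly (a hedged sketch of the C-level contract):
      *
      *	int copystr(const char *from, char *to, size_t maxlength,
      *	    size_t *lencopied);
      *
      * copies at most maxlength bytes including the terminating NUL, stores
      * the number of bytes copied through *lencopied when lencopied is
      * non-NULL, and returns 0 on success or ENAMETOOLONG if maxlength is
      * exhausted first.
      */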
1467 
1468         ENTRY(copystr)
1469         pushq   %rbp
1470         movq    %rsp, %rbp
1471 #ifdef DEBUG
1472         movq    kernelbase(%rip), %rax
1473         cmpq    %rax, %rdi              /* %rdi = from */
1474         jb      0f
1475         cmpq    %rax, %rsi              /* %rsi = to */
1476         jnb     1f
1477 0:      leaq    .copystr_panic_msg(%rip), %rdi
1478         xorl    %eax, %eax
1479         call    panic
1480 1:
1481 #endif
1482         movq    %gs:CPU_THREAD, %r9
1483         movq    T_LOFAULT(%r9), %r8     /* pass current lofault value as */
1484                                         /* 5th argument to do_copystr */
1485         xorl    %r10d,%r10d             /* pass smap restore need in %r10d */
1486                                         /* as a non-ABI 6th arg */
1487 do_copystr:
1488         movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
1489         movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
1490         movq    %r8, T_LOFAULT(%r9)     /* new lofault */
1491 
1492         movq    %rdx, %r8               /* save maxlength */
1493 
1494         cmpq    $0, %rdx                /* %rdx = maxlength */
1495         je      copystr_enametoolong    /* maxlength == 0 */
1496 
1497 copystr_loop:
1498         decq    %r8
1499         movb    (%rdi), %al
1500         incq    %rdi
1501         movb    %al, (%rsi)
1502         incq    %rsi
1503         cmpb    $0, %al
1504         je      copystr_null            /* null char */
1505         cmpq    $0, %r8
1506         jne     copystr_loop
1507 
1508 copystr_enametoolong:
1509         movl    $ENAMETOOLONG, %eax
1510         jmp     copystr_out
1511 
1512 copystr_null:
1513         xorl    %eax, %eax              /* no error */
1514 
1515 copystr_out:
1516         cmpq    $0, %rcx                /* want length? */
1517         je      copystr_smap            /* no */
1518         subq    %r8, %rdx               /* compute length and store it */
1519         movq    %rdx, (%rcx)
1520 
1521 copystr_smap:
1522         cmpl    $0, %r10d
1523         jz      copystr_done
1524         SMAP_ENABLE_INSTR(7)
1525 
1526 copystr_done:
1527         movq    %r11, T_LOFAULT(%r9)    /* restore the original lofault */
1528         leave
1529         ret
1530         SET_SIZE(copystr)
1531 
1532 /*
1533  * Copy a null terminated string from the user address space into
1534  * the kernel address space.
1535  */
1536 
1537         ENTRY(copyinstr)
1538         pushq   %rbp
1539         movq    %rsp, %rbp
1540         subq    $32, %rsp
1541 
1542         /*
1543          * save args in case we trap and need to rerun as a copyop
1544          */
1545         movq    %rdi, (%rsp)
1546         movq    %rsi, 0x8(%rsp)
1547         movq    %rdx, 0x10(%rsp)
1548         movq    %rcx, 0x18(%rsp)
1549 
1550         movq    kernelbase(%rip), %rax
1551 #ifdef DEBUG
1552         cmpq    %rax, %rsi              /* %rsi = kaddr */
1553         jnb     1f
1554         leaq    .copyinstr_panic_msg(%rip), %rdi
1555         xorl    %eax, %eax
1556         call    panic
1557 1:
1558 #endif
1559         /*
1560          * pass lofault value as 5th argument to do_copystr
1561          * do_copystr expects the SMAP restore flag in %r10d
1562          */
1563         leaq    _copyinstr_error(%rip), %r8
1564         movl    $1, %r10d
1565 
1566         cmpq    %rax, %rdi              /* test uaddr < kernelbase */
1567         jae     4f
1568         SMAP_DISABLE_INSTR(6)
1569         jmp     do_copystr
1570 4:
1571         movq    %gs:CPU_THREAD, %r9
1572         jmp     3f
1573 
1574 _copyinstr_error:
1575         SMAP_ENABLE_INSTR(8)
1576         movq    %r11, T_LOFAULT(%r9)    /* restore original lofault */
1577 3:
1578         movq    T_COPYOPS(%r9), %rax
1579         cmpq    $0, %rax
1580         jz      2f
1581 
1582         /*
1583          * reload args for the copyop
1584          */
1585         movq    (%rsp), %rdi
1586         movq    0x8(%rsp), %rsi
1587         movq    0x10(%rsp), %rdx
1588         movq    0x18(%rsp), %rcx
1589         leave
1590         movq    CP_COPYINSTR(%rax), %rax
1591         INDIRECT_JMP_REG(rax)
1592 
1593 2:      movl    $EFAULT, %eax           /* return EFAULT */
1594         leave
1595         ret
1596         SET_SIZE(copyinstr)
1597 
1598 /*
1599  * Copy a null terminated string from the kernel
1600  * address space to the user address space.
1601  */
1602 
1603         ENTRY(copyoutstr)
1604         pushq   %rbp
1605         movq    %rsp, %rbp
1606         subq    $32, %rsp
1607 
1608         /*
1609          * save args in case we trap and need to rerun as a copyop
1610          */
1611         movq    %rdi, (%rsp)
1612         movq    %rsi, 0x8(%rsp)
1613         movq    %rdx, 0x10(%rsp)
1614         movq    %rcx, 0x18(%rsp)
1615 
1616         movq    kernelbase(%rip), %rax
1617 #ifdef DEBUG
1618         cmpq    %rax, %rdi              /* %rdi = kaddr */
1619         jnb     1f
1620         leaq    .copyoutstr_panic_msg(%rip), %rdi
1621         jmp     call_panic              /* setup stack and call panic */
1622 1:
1623 #endif
1624         /*
1625          * pass lofault value as 5th argument to do_copystr
1626          * pass one as 6th argument to do_copystr in %r10d
1627          */
1628         leaq    _copyoutstr_error(%rip), %r8
1629         movl    $1, %r10d
1630 
1631         cmpq    %rax, %rsi              /* test uaddr < kernelbase */
1632         jae     4f
1633         SMAP_DISABLE_INSTR(7)
1634         jmp     do_copystr
1635 4:
1636         movq    %gs:CPU_THREAD, %r9
1637         jmp     3f
1638 
1639 _copyoutstr_error:
1640         SMAP_ENABLE_INSTR(9)
1641         movq    %r11, T_LOFAULT(%r9)    /* restore the original lofault */
1642 3:
1643         movq    T_COPYOPS(%r9), %rax
1644         cmpq    $0, %rax
1645         jz      2f
1646 
1647         /*
1648          * reload args for the copyop
1649          */
1650         movq    (%rsp), %rdi
1651         movq    0x8(%rsp), %rsi
1652         movq    0x10(%rsp), %rdx
1653         movq    0x18(%rsp), %rcx
1654         leave
1655         movq    CP_COPYOUTSTR(%rax), %rax
1656         INDIRECT_JMP_REG(rax)
1657 
1658 2:      movl    $EFAULT, %eax           /* return EFAULT */
1659         leave
1660         ret
1661         SET_SIZE(copyoutstr)
1662 
1663 /*
1664  * Since all of the fuword() variants are so similar, we have a macro to spit
1665  * them out.  This allows us to create DTrace-unobservable functions easily.
1666  */
1667 
1668 /*
1669  * Note that we don't save and reload the arguments here
1670  * because their values are not altered in the copy path.
1671  * Additionally, on success we simply re-enable SMAP and
1672  * return directly to our original caller.
1673  */
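     /*
      * Each expansion of FUWORD() below yields a fetch-from-user routine
      * of roughly the following shape (a sketch read off the macro body,
      * not quoted from a header):
      *
      *         int fuword64(const void *uaddr, uint64_t *valp);
      *         int fuword32(const void *uaddr, uint32_t *valp);
      *         int fuword16(const void *uaddr, uint16_t *valp);
      *         int fuword8(const void *uaddr, uint8_t *valp);
      *
      * On success the value read from uaddr is stored through valp and 0
      * is returned; a fault or a kernel address falls back to the copyops
      * vector, or returns -1 if none is installed.
      */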
1674 
1675 #define FUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2)      \
1676         ENTRY(NAME)                             \
1677         movq    %gs:CPU_THREAD, %r9;            \
1678         cmpq    kernelbase(%rip), %rdi;         \
1679         jae     1f;                             \
1680         leaq    _flt_/**/NAME, %rdx;            \
1681         movq    %rdx, T_LOFAULT(%r9);           \
1682         SMAP_DISABLE_INSTR(DISNUM)              \
1683         INSTR   (%rdi), REG;                    \
1684         movq    $0, T_LOFAULT(%r9);             \
1685         INSTR   REG, (%rsi);                    \
1686         xorl    %eax, %eax;                     \
1687         SMAP_ENABLE_INSTR(EN1)                  \
1688         ret;                                    \
1689 _flt_/**/NAME:                                  \
1690         SMAP_ENABLE_INSTR(EN2)                  \
1691         movq    $0, T_LOFAULT(%r9);             \
1692 1:                                              \
1693         movq    T_COPYOPS(%r9), %rax;           \
1694         cmpq    $0, %rax;                       \
1695         jz      2f;                             \
1696         movq    COPYOP(%rax), %rax;             \
1697         INDIRECT_JMP_REG(rax);                  \
1698 2:                                              \
1699         movl    $-1, %eax;                      \
1700         ret;                                    \
1701         SET_SIZE(NAME)
1702 
1703         FUWORD(fuword64, movq, %rax, CP_FUWORD64,8,10,11)
1704         FUWORD(fuword32, movl, %eax, CP_FUWORD32,9,12,13)
1705         FUWORD(fuword16, movw, %ax, CP_FUWORD16,10,14,15)
1706         FUWORD(fuword8, movb, %al, CP_FUWORD8,11,16,17)
1707 
1708 #undef  FUWORD
1709 
1710 /*
1711  * Set user word.
1712  */
1713 
1714 /*
1715  * Note that we don't save and reload the arguments here
1716  * because their values are not altered in the copy path.
1717  */
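     /*
      * The corresponding store-to-user routines generated below look
      * roughly like this (again a sketch read off the macro body, not
      * quoted from a header):
      *
      *         int suword64(void *uaddr, uint64_t value);
      *         int suword32(void *uaddr, uint32_t value);
      *         int suword16(void *uaddr, uint16_t value);
      *         int suword8(void *uaddr, uint8_t value);
      *
      * 0 is returned once the store succeeds; a fault or a kernel address
      * falls back to the copyops vector, or returns -1 if none is
      * installed.
      */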
1718 
1719 #define SUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2)      \
1720         ENTRY(NAME)                             \
1721         movq    %gs:CPU_THREAD, %r9;            \
1722         cmpq    kernelbase(%rip), %rdi;         \
1723         jae     1f;                             \
1724         leaq    _flt_/**/NAME, %rdx;            \
1725         SMAP_DISABLE_INSTR(DISNUM)              \
1726         movq    %rdx, T_LOFAULT(%r9);           \
1727         INSTR   REG, (%rdi);                    \
1728         movq    $0, T_LOFAULT(%r9);             \
1729         xorl    %eax, %eax;                     \
1730         SMAP_ENABLE_INSTR(EN1)                  \
1731         ret;                                    \
1732 _flt_/**/NAME:                                  \
1733         SMAP_ENABLE_INSTR(EN2)                  \
1734         movq    $0, T_LOFAULT(%r9);             \
1735 1:                                              \
1736         movq    T_COPYOPS(%r9), %rax;           \
1737         cmpq    $0, %rax;                       \
1738         jz      3f;                             \
1739         movq    COPYOP(%rax), %rax;             \
1740         INDIRECT_JMP_REG(rax);                  \
1741 3:                                              \
1742         movl    $-1, %eax;                      \
1743         ret;                                    \
1744         SET_SIZE(NAME)
1745 
1746         SUWORD(suword64, movq, %rsi, CP_SUWORD64,12,18,19)
1747         SUWORD(suword32, movl, %esi, CP_SUWORD32,13,20,21)
1748         SUWORD(suword16, movw, %si, CP_SUWORD16,14,22,23)
1749         SUWORD(suword8, movb, %sil, CP_SUWORD8,15,24,25)
1750 
1751 #undef  SUWORD
1752 
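     /*
      * The _noerr variants below install no per-call fault handler; a
      * user address at or above kernelbase is clamped to kernelbase (the
      * cmovnbq) to force any fault to happen there.  Assumed C-level
      * shape (a sketch; the _noerr routines return nothing):
      *
      *         void fuword64_noerr(const void *uaddr, uint64_t *valp);
      *         void suword64_noerr(void *uaddr, uint64_t value);
      *
      * and likewise for the 32-, 16- and 8-bit widths.
      */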
1753 #define FUWORD_NOERR(NAME, INSTR, REG)          \
1754         ENTRY(NAME)                             \
1755         cmpq    kernelbase(%rip), %rdi;         \
1756         cmovnbq kernelbase(%rip), %rdi;         \
1757         INSTR   (%rdi), REG;                    \
1758         INSTR   REG, (%rsi);                    \
1759         ret;                                    \
1760         SET_SIZE(NAME)
1761 
1762         FUWORD_NOERR(fuword64_noerr, movq, %rax)
1763         FUWORD_NOERR(fuword32_noerr, movl, %eax)
1764         FUWORD_NOERR(fuword16_noerr, movw, %ax)
1765         FUWORD_NOERR(fuword8_noerr, movb, %al)
1766 
1767 #undef  FUWORD_NOERR
1768 
1769 #define SUWORD_NOERR(NAME, INSTR, REG)          \
1770         ENTRY(NAME)                             \
1771         cmpq    kernelbase(%rip), %rdi;         \
1772         cmovnbq kernelbase(%rip), %rdi;         \
1773         INSTR   REG, (%rdi);                    \
1774         ret;                                    \
1775         SET_SIZE(NAME)
1776 
1777         SUWORD_NOERR(suword64_noerr, movq, %rsi)
1778         SUWORD_NOERR(suword32_noerr, movl, %esi)
1779         SUWORD_NOERR(suword16_noerr, movw, %si)
1780         SUWORD_NOERR(suword8_noerr, movb, %sil)
1781 
1782 #undef  SUWORD_NOERR
1783 
1784 
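     /*
      * On a 64-bit kernel the "long word" routines operate on 64-bit
      * quantities and subyte is simply an 8-bit store, so they are
      * provided as weak aliases of the corresponding routines above
      * rather than as separate bodies.
      */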
1785         .weak   subyte
1786         subyte=suword8
1787         .weak   subyte_noerr
1788         subyte_noerr=suword8_noerr
1789 
1790         .weak   fulword
1791         fulword=fuword64
1792         .weak   fulword_noerr
1793         fulword_noerr=fuword64_noerr
1794         .weak   sulword
1795         sulword=suword64
1796         .weak   sulword_noerr
1797         sulword_noerr=suword64_noerr
1798 
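     /*
      * Assumed C-level shapes of the bulk no-error-check entry points
      * that follow (sketches based on the register usage, not quoted
      * from a header):
      *
      *         void copyin_noerr(const void *ufrom, void *kto, size_t count);
      *         void copyout_noerr(const void *kfrom, void *uto, size_t count);
      *         void uzero(void *uaddr, size_t count);
      *         void ucopy(const void *ufrom, void *uto, size_t ulength);
      *
      * Addresses at or above kernelbase are replaced with kernelbase to
      * force any fault to happen there.
      */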
1799         ENTRY(copyin_noerr)
1800         movq    kernelbase(%rip), %rax
1801 #ifdef DEBUG
1802         cmpq    %rax, %rsi              /* %rsi = kto */
1803         jae     1f
1804         leaq    .cpyin_ne_pmsg(%rip), %rdi
1805         jmp     call_panic              /* setup stack and call panic */
1806 1:
1807 #endif
1808         cmpq    %rax, %rdi              /* ufrom < kernelbase */
1809         jb      do_copy
1810         movq    %rax, %rdi              /* force fault at kernelbase */
1811         jmp     do_copy
1812         SET_SIZE(copyin_noerr)
1813 
1814         ENTRY(copyout_noerr)
1815         movq    kernelbase(%rip), %rax
1816 #ifdef DEBUG
1817         cmpq    %rax, %rdi              /* %rdi = kfrom */
1818         jae     1f
1819         leaq    .cpyout_ne_pmsg(%rip), %rdi
1820         jmp     call_panic              /* setup stack and call panic */
1821 1:
1822 #endif
1823         cmpq    %rax, %rsi              /* uto < kernelbase */
1824         jb      do_copy
1825         movq    %rax, %rsi              /* force fault at kernelbase */
1826         jmp     do_copy
1827         SET_SIZE(copyout_noerr)
1828 
1829         ENTRY(uzero)
1830         movq    kernelbase(%rip), %rax
1831         cmpq    %rax, %rdi
1832         jb      do_zero
1833         movq    %rax, %rdi      /* force fault at kernelbase */
1834         jmp     do_zero
1835         SET_SIZE(uzero)
1836 
1837         ENTRY(ucopy)
1838         movq    kernelbase(%rip), %rax
1839         cmpq    %rax, %rdi
1840         cmovaeq %rax, %rdi      /* force fault at kernelbase */
1841         cmpq    %rax, %rsi
1842         cmovaeq %rax, %rsi      /* force fault at kernelbase */
1843         jmp     do_copy
1844         SET_SIZE(ucopy)
1845 
1846         /*
1847          * Note that the frame pointer is required here because do_copystr
1848          * expects to be able to pop it off!
1849          */
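     /*
      * Assumed C-level shape (a sketch matching the copystr-style
      * argument usage, not quoted from a header):
      *
      *         int ucopystr(const char *ufrom, char *uto, size_t umaxlength,
      *             size_t *lencopied);
      */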
1850         ENTRY(ucopystr)
1851         pushq   %rbp
1852         movq    %rsp, %rbp
1853         movq    kernelbase(%rip), %rax
1854         cmpq    %rax, %rdi
1855         cmovaeq %rax, %rdi      /* force fault at kernelbase */
1856         cmpq    %rax, %rsi
1857         cmovaeq %rax, %rsi      /* force fault at kernelbase */
1858         /* do_copystr expects the lofault address in %r8 */
1859         /* do_copystr expects the SMAP flag in %r10d (0: no re-enable needed) */
1860         xorl    %r10d, %r10d
1861         movq    %gs:CPU_THREAD, %r8
1862         movq    T_LOFAULT(%r8), %r8
1863         jmp     do_copystr
1864         SET_SIZE(ucopystr)
1865 
1866 #ifdef DEBUG
1867         .data
1868 .kcopy_panic_msg:
1869         .string "kcopy: arguments below kernelbase"
1870 .bcopy_panic_msg:
1871         .string "bcopy: arguments below kernelbase"
1872 .kzero_panic_msg:
1873         .string "kzero: arguments below kernelbase"
1874 .bzero_panic_msg:
1875         .string "bzero: arguments below kernelbase"
1876 .copyin_panic_msg:
1877         .string "copyin: kaddr argument below kernelbase"
1878 .xcopyin_panic_msg:
1879         .string "xcopyin: kaddr argument below kernelbase"
1880 .copyout_panic_msg:
1881         .string "copyout: kaddr argument below kernelbase"
1882 .xcopyout_panic_msg:
1883         .string "xcopyout: kaddr argument below kernelbase"
1884 .copystr_panic_msg:
1885         .string "copystr: arguments in user space"
1886 .copyinstr_panic_msg:
1887         .string "copyinstr: kaddr argument not in kernel address space"
1888 .copyoutstr_panic_msg:
1889         .string "copyoutstr: kaddr argument not in kernel address space"
1890 .cpyin_ne_pmsg:
1891         .string "copyin_noerr: argument not in kernel address space"
1892 .cpyout_ne_pmsg:
1893         .string "copyout_noerr: argument not in kernel address space"
1894 #endif
1895 
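     /*
      * Counts of the SMAP_ENABLE_INSTR()/SMAP_DISABLE_INSTR() sites above.
      * They are exported as data so that the code which patches those
      * sites at boot can verify it has accounted for every location; the
      * exact consumer lives outside this file.
      */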
1896 .data
1897 .align  4
1898 .globl  _smap_enable_patch_count
1899 .type   _smap_enable_patch_count,@object
1900 .size   _smap_enable_patch_count, 4
1901 _smap_enable_patch_count:
1902         .long   SMAP_ENABLE_COUNT
1903 
1904 .globl  _smap_disable_patch_count
1905 .type   _smap_disable_patch_count,@object
1906 .size   _smap_disable_patch_count, 4
1907 _smap_disable_patch_count:
1908         .long SMAP_DISABLE_COUNT