1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 /*
  12  * Copyright 2018 Joyent, Inc.
  13  */
  14 
  15 /*
  16  * This file contains the trampolines that are used by KPTI in order to be
  17  * able to take interrupts/trap/etc while on the "user" page table.
  18  *
  19  * We don't map the full kernel text into the user page table: instead we
  20  * map this one small section of trampolines (which compiles to ~13 pages).
  21  * These trampolines are set in the IDT always (so they will run no matter
  22  * whether we're on the kernel or user page table), and their primary job is to
  23  * pivot us to the kernel %cr3 and %rsp without ruining everything.
  24  *
  25  * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
  26  * meaning that they will execute with their %rsp set to a known location, even
  27  * if we take them in the kernel.
  28  *
  29  * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
  30  * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
  31  * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
  32  * page-aligned, and we map the page it's on into both page tables. Using a
  33  * struct attached to the cpu_t also means that we can use %rsp-relative
  34  * addressing to find anything on the cpu_t, so we don't have to touch %gs or
  35  * GSBASE at all on incoming interrupt trampolines (which can get pretty hairy).
  36  *
  37  * This little struct is where the CPU will push the actual interrupt frame.
  38  * Then, in the trampoline, we change %cr3, then figure out our destination
  39  * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt
  40  * frame). Then we jump to the regular ISR in the kernel text and carry on as
  41  * normal.
  42  *
  43  * We leave the original frame and any spilled regs behind in the kpti_frame
  44  * lazily until we want to return to userland. Then, we clear any spilled
  45  * regs from it, and overwrite the rest with our iret frame. When switching
  46  * this cpu to a different process (in hat_switch), we bzero the whole region to
  47  * make sure nothing can leak between processes.
  48  *
  49  * When we're returning back to the original place we took the interrupt later
  50  * (especially if it was in userland), we have to jmp back to the "return
  51  * trampolines" here, since when we set %cr3 back to the user value, we need to
  52  * be executing from code here in these shared pages and not the main kernel
  53  * text again. Even though it should be fine to iret directly from kernel text
  54  * when returning to kernel code, we make things jmp to a trampoline here just
  55  * for consistency.
  56  *
 * Note that with IST, it's very important that we must always have pivoted
 * away from the IST stack before we could possibly take any other interrupt
  59  * on the same IST (unless it's an end-of-the-world fault and we don't care
  60  * about coming back from it ever).
  61  *
  62  * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
  63  * regularly have to happen from within trampoline code (e.g. in the sysenter
  64  * single-step case) and then return to the world normally. As a result, these
  65  * two are IST'd to their own kpti_frame right above the normal one (in the same
  66  * page), so they don't clobber their parent interrupt.
  67  *
  68  * To aid with debugging, we also IST the page fault (#PF/pftrap), general
  69  * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
  70  * their own separate kpti_frame. This ensures that if we take one of these
  71  * due to a bug in trampoline code, we preserve the original trampoline
  72  * state that caused the trap.
  73  *
  74  * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
  75  * stacks, since they can interrupt another ISR at any time. These stacks are
  76  * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
  77  * their trampolines (and do it unconditionally), and don't bother pivoting
  78  * away. We're either going into the panic() path, or we're going to return
  79  * straight away without rescheduling, so it's fine to not be on our real
  80  * kthread stack (and some of the state we want to go find it with might be
  81  * corrupt!)
  82  *
  83  * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
  84  * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
  85  * point at the PML4 for kas early in boot and never touch it again. Hopefully
  86  * it survives whatever corruption brings down the rest of the kernel!
  87  *
  88  * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
  89  * cases) in that they do not push an interrupt frame (and also have some other
  90  * effects). In the syscall trampolines, we assume that we can only be taking
  91  * the call from userland and use SWAPGS and an unconditional overwrite of %cr3.
  92  * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
  93  * existing %rsp pivot untouched) -- instead we spill registers into
  94  * %gs:CPU_KPTI_* as we need to.
  95  *
  96  * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
  97  * hat_switch().
  98  */
  99 
 100 /*
 101  * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
 102  * fix bugs here check to see if they should be fixed there as well.
 103  */
 104 
 105 #include <sys/asm_linkage.h>
 106 #include <sys/asm_misc.h>
 107 #include <sys/regset.h>
 108 #include <sys/privregs.h>
 109 #include <sys/psw.h>
 110 #include <sys/machbrand.h>
 111 #include <sys/param.h>
 112 
 113 #if defined(__lint)
 114 
 115 #include <sys/types.h>
 116 #include <sys/thread.h>
 117 #include <sys/systm.h>
 118 
 119 #else   /* __lint */
 120 
 121 #include <sys/segments.h>
 122 #include <sys/pcb.h>
 123 #include <sys/trap.h>
 124 #include <sys/ftrace.h>
 125 #include <sys/traptrace.h>
 126 #include <sys/clock.h>
 127 #include <sys/model.h>
 128 #include <sys/panic.h>
 129 
 130 #if defined(__xpv)
 131 #include <sys/hypervisor.h>
 132 #endif
 133 
 134 #include "assym.h"
 135 
        .data
        /*
         * kpti_enable: 8-byte flag, initialized to 1.  The trampolines
         * below compare it against 1 and skip all %cr3/stack work when
         * it isn't (presumably cleared by boot code when KPTI is
         * disabled -- the writer is not visible in this file).
         */
        DGDEF3(kpti_enable, 8, 8)
        .fill   1, 8, 1

#if DEBUG
        .data
_bad_ts_panic_msg:
        .string "kpti_trampolines.s: tr_iret_user but CR0.TS set"
#endif

.section ".text";
.align MMU_PAGESIZE

/*
 * kpti_tramp_start (with kpti_tramp_end) bounds the trampoline text.
 * The DBG_* trampolines below compare a trapped %rip against these to
 * detect that a trap interrupted trampoline code.
 */
.global kpti_tramp_start
kpti_tramp_start:
        nop

/* This will be set by mlsetup, and then double-checked later */
.global kpti_safe_cr3
kpti_safe_cr3:
        .quad 0
        SET_SIZE(kpti_safe_cr3)

/* startup_kmem() will overwrite this */
.global kpti_kbase
kpti_kbase:
        .quad KERNELBASE
        SET_SIZE(kpti_kbase)
 164 
/*
 * Switch to the kernel %cr3, recording the %cr3 we arrived with in
 * %gs:CPU_KPTI_TR_CR3 first.  A zero %gs:CPU_KPTI_KCR3 means there is
 * no kernel %cr3 to switch to, in which case %cr3 is left alone.
 * Clobbers spillreg and flags; requires the kernel GSBASE.
 */
#define SET_KERNEL_CR3(spillreg)                \
        mov     %cr3, spillreg;                 \
        /* record the %cr3 we arrived on */     \
        mov     spillreg, %gs:CPU_KPTI_TR_CR3;  \
        mov     %gs:CPU_KPTI_KCR3, spillreg;    \
        /* KCR3 == 0: nothing to switch to */   \
        cmp     $0, spillreg;                   \
        je      2f;                             \
        mov     spillreg, %cr3;                 \
2:
 173 
/*
 * Unconditionally switch to the "user" %cr3 from %gs:CPU_KPTI_UCR3.
 * The DEBUG variant additionally records the %cr3 we were on in
 * %gs:CPU_KPTI_TR_CR3.  Clobbers spillreg; requires the kernel GSBASE.
 */
#if DEBUG
#define SET_USER_CR3(spillreg)                  \
        mov     %cr3, spillreg;                 \
        mov     spillreg, %gs:CPU_KPTI_TR_CR3;  \
        mov     %gs:CPU_KPTI_UCR3, spillreg;    \
        mov     spillreg, %cr3
#else
#define SET_USER_CR3(spillreg)                  \
        mov     %gs:CPU_KPTI_UCR3, spillreg;    \
        mov     spillreg, %cr3
#endif
 185 
/*
 * Pivot from the current stack onto the per-cpu return stack at
 * %gs:CPU_KPTI_RET_RSP, copying over the 5-word iret frame that the
 * old %rsp pointed at (T_FRAMERET_* offsets).  spillreg ends up
 * holding the old %rsp; requires the kernel GSBASE.
 */
#define PIVOT_KPTI_STK(spillreg)                \
        mov     %rsp, spillreg;                 \
        mov     %gs:CPU_KPTI_RET_RSP, %rsp;     \
        pushq   T_FRAMERET_SS(spillreg);        \
        pushq   T_FRAMERET_RSP(spillreg);       \
        pushq   T_FRAMERET_RFLAGS(spillreg);    \
        pushq   T_FRAMERET_CS(spillreg);        \
        pushq   T_FRAMERET_RIP(spillreg)
 194 
 195 
/*
 * Common interrupt trampoline body.  On entry the CPU has pushed an
 * iret frame (and an error code, for vectors that have one) onto this
 * interrupt's kpti_frame via IST, and %rsp points at it.  We:
 *  1) spill %r13/%r14 into the frame, and drop %rsp to the base of the
 *     struct kpti_frame so that the KPTI_* offsets index it;
 *  2) record the incoming %cr3 and switch to the kernel %cr3 (if set);
 *  3) pick a destination stack: the top of the current kthread's stack
 *     if we interrupted userland (or a frame whose %rsp is below
 *     kpti_kbase), otherwise the interrupted kernel %rsp (16-aligned);
 *  4) re-push the iret frame there ("errpush" re-pushes the error code
 *     for ISRs that want one) and restore the spilled registers.
 */
#define INTERRUPT_TRAMPOLINE_P(errpush) \
        pushq   %r13;                           \
        pushq   %r14;                           \
        subq    $KPTI_R14, %rsp;                \
        /* Save current %cr3. */                \
        mov     %cr3, %r14;                     \
        mov     %r14, KPTI_TR_CR3(%rsp);        \
                                                \
        /* Did we come from kernel %cs? */      \
        cmpw    $KCS_SEL, KPTI_CS(%rsp);        \
        je      3f;                             \
1:                                              \
        /* Change to the "kernel" %cr3 */       \
        mov     KPTI_KCR3(%rsp), %r14;          \
        cmp     $0, %r14;                       \
        je      2f;                             \
        mov     %r14, %cr3;                     \
2:                                              \
        /* Get our cpu_t in %r13: kpti_frame */ \
        /* is page-aligned, CPU_KPTI_START */   \
        /* bytes into the cpu_t. */             \
        mov     %rsp, %r13;                     \
        and     $(~(MMU_PAGESIZE - 1)), %r13;   \
        subq    $CPU_KPTI_START, %r13;          \
        /* Use top of the kthread stk */        \
        mov     CPU_THREAD(%r13), %r14;         \
        mov     T_STACK(%r14), %r14;            \
        addq    $REGSIZE+MINFRAME, %r14;        \
        jmp     4f;                             \
3:                                              \
        /* Check the %rsp in the frame. */      \
        /* Is it above kernel base? */          \
        mov     kpti_kbase, %r14;               \
        cmp     %r14, KPTI_RSP(%rsp);           \
        jb      1b;                             \
        /* Use the %rsp from the trap frame */  \
        mov     KPTI_RSP(%rsp), %r14;           \
        and     $(~0xf), %r14;                  \
4:                                              \
        mov     %rsp, %r13;                     \
        /* %r14 contains our destination stk */ \
        mov     %r14, %rsp;                     \
        /* Re-push the CPU's iret frame. */     \
        pushq   KPTI_SS(%r13);                  \
        pushq   KPTI_RSP(%r13);                 \
        pushq   KPTI_RFLAGS(%r13);              \
        pushq   KPTI_CS(%r13);                  \
        pushq   KPTI_RIP(%r13);                 \
        errpush;                                \
        /* Restore the regs we spilled. */      \
        mov     KPTI_R14(%r13), %r14;           \
        mov     KPTI_R13(%r13), %r13

/* No error code to re-push for the ISR. */
#define INTERRUPT_TRAMPOLINE_NOERR              \
        INTERRUPT_TRAMPOLINE_P(/**/)

/* Re-push the saved error code for the ISR. */
#define INTERRUPT_TRAMPOLINE                    \
        INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
 249 
 250 /*
 251  * This is used for all interrupts that can plausibly be taken inside another
 252  * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
 253  *
 254  * We also use this for #NP, even though it uses the standard IST: the
 255  * additional %rsp checks below will catch when we get an exception doing an
 256  * iret to userspace with a bad %cs/%ss.  This appears as a kernel trap, and
 257  * only later gets redirected via kern_gpfault().
 258  *
 259  * We check for whether we took the interrupt while in another trampoline, in
 260  * which case we need to use the kthread stack.
 261  */
/*
 * Like INTERRUPT_TRAMPOLINE_P, with two additions: KPTI_FLAG marks the
 * frame as live so that re-entry (which would clobber saved state) is
 * caught, and a trapped kernel %rip inside [kpti_tramp_start,
 * kpti_tramp_end] forces the %cr3 switch, since the interrupted
 * trampoline may not have done it yet.
 */
#define DBG_INTERRUPT_TRAMPOLINE_P(errpush)     \
        pushq   %r13;                           \
        pushq   %r14;                           \
        subq    $KPTI_R14, %rsp;                \
        /* Check for clobbering: the flag is */ \
        /* nonzero while the frame is live. */  \
        cmp     $0, KPTI_FLAG(%rsp);            \
        je      1f;                             \
        /* Frame still live: force a #DF as */  \
        /* an emergency panic path. */          \
        /* Don't worry, this totally works */   \
        int     $8;                             \
1:                                              \
        /* Mark the frame in-use. */            \
        movq    $1, KPTI_FLAG(%rsp);            \
        /* Save current %cr3. */                \
        mov     %cr3, %r14;                     \
        mov     %r14, KPTI_TR_CR3(%rsp);        \
                                                \
        cmpw    $KCS_SEL, KPTI_CS(%rsp);        \
        je      4f;                             \
2:                                              \
        /* Change to the "kernel" %cr3 */       \
        mov     KPTI_KCR3(%rsp), %r14;          \
        cmp     $0, %r14;                       \
        je      3f;                             \
        mov     %r14, %cr3;                     \
3:                                              \
        /* Get our cpu_t in %r13 */             \
        mov     %rsp, %r13;                     \
        and     $(~(MMU_PAGESIZE - 1)), %r13;   \
        subq    $CPU_KPTI_START, %r13;          \
        /* Use top of the kthread stk */        \
        mov     CPU_THREAD(%r13), %r14;         \
        mov     T_STACK(%r14), %r14;            \
        addq    $REGSIZE+MINFRAME, %r14;        \
        jmp     6f;                             \
4:                                              \
        /* Check the %rsp in the frame. */      \
        /* Is it above kernel base? */          \
        /* If not, treat as user. */            \
        mov     kpti_kbase, %r14;               \
        cmp     %r14, KPTI_RSP(%rsp);           \
        jb      2b;                             \
        /* Is it within the kpti_frame page? */ \
        /* If it is, treat as user interrupt */ \
        mov     %rsp, %r13;                     \
        and     $(~(MMU_PAGESIZE - 1)), %r13;   \
        mov     KPTI_RSP(%rsp), %r14;           \
        and     $(~(MMU_PAGESIZE - 1)), %r14;   \
        cmp     %r13, %r14;                     \
        je      2b;                             \
        /* Were we in trampoline code? */       \
        leaq    kpti_tramp_start, %r14;         \
        cmp     %r14, KPTI_RIP(%rsp);           \
        jb      5f;                             \
        leaq    kpti_tramp_end, %r14;           \
        cmp     %r14, KPTI_RIP(%rsp);           \
        ja      5f;                             \
        /* If we were, change %cr3: we might */ \
        /* have interrupted before it did. */   \
        mov     KPTI_KCR3(%rsp), %r14;          \
        mov     %r14, %cr3;                     \
5:                                              \
        /* Use the %rsp from the trap frame */  \
        mov     KPTI_RSP(%rsp), %r14;           \
        and     $(~0xf), %r14;                  \
6:                                              \
        mov     %rsp, %r13;                     \
        /* %r14 contains our destination stk */ \
        mov     %r14, %rsp;                     \
        /* Re-push the CPU's iret frame. */     \
        pushq   KPTI_SS(%r13);                  \
        pushq   KPTI_RSP(%r13);                 \
        pushq   KPTI_RFLAGS(%r13);              \
        pushq   KPTI_CS(%r13);                  \
        pushq   KPTI_RIP(%r13);                 \
        errpush;                                \
        /* Restore regs, release the frame. */  \
        mov     KPTI_R14(%r13), %r14;           \
        movq    $0, KPTI_FLAG(%r13);            \
        mov     KPTI_R13(%r13), %r13

/* No error code to re-push for the ISR. */
#define DBG_INTERRUPT_TRAMPOLINE_NOERR          \
        DBG_INTERRUPT_TRAMPOLINE_P(/**/)

/* Re-push the saved error code for the ISR. */
#define DBG_INTERRUPT_TRAMPOLINE                \
        DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
 344 
        /*
         * These labels (_start and _end) are used by trap.c to determine if
         * we took an interrupt like an NMI during the return process.
         */
.global tr_sysc_ret_start
tr_sysc_ret_start:

        /*
         * Syscall return trampolines.
         *
         * These are expected to be called on the kernel %gs. tr_sysret[ql] are
         * called after %rsp is changed back to the user value, so we have no
         * stack to work with. tr_sysexit has a kernel stack (but has to
         * preserve rflags, soooo).
         */
        ENTRY_NP(tr_sysretq)
        cmpq    $1, kpti_enable
        jne     1f                      /* KPTI off: plain sysretq */

        /* Spill %r13 so we can use it for the %cr3 switch */
        mov     %r13, %gs:CPU_KPTI_R13
        SET_USER_CR3(%r13)
        mov     %gs:CPU_KPTI_R13, %r13
        /* Zero these to make sure they didn't leak from a kernel trap */
        movq    $0, %gs:CPU_KPTI_R13
        movq    $0, %gs:CPU_KPTI_R14
1:
        swapgs
        sysretq
        SET_SIZE(tr_sysretq)
 374 
        /*
         * 32-bit flavor of the above: identical %cr3 handling, but ends
         * in SYSRETL (macro forms of SWAPGS/SYSRETL are used here).
         */
        ENTRY_NP(tr_sysretl)
        cmpq    $1, kpti_enable
        jne     1f                      /* KPTI off: plain sysretl */

        /* Spill %r13 so we can use it for the %cr3 switch */
        mov     %r13, %gs:CPU_KPTI_R13
        SET_USER_CR3(%r13)
        mov     %gs:CPU_KPTI_R13, %r13
        /* Zero these to make sure they didn't leak from a kernel trap */
        movq    $0, %gs:CPU_KPTI_R13
        movq    $0, %gs:CPU_KPTI_R14
1:
        SWAPGS
        SYSRETL
        SET_SIZE(tr_sysretl)
 389 
        ENTRY_NP(tr_sysexit)
        /*
         * Note: we want to preserve RFLAGS across this branch, since sysexit
         * (unlike sysret above) does not restore RFLAGS for us.
         *
         * We still have the real kernel stack (sysexit does restore that), so
         * we can use pushfq/popfq.
         */
        pushfq

        cmpq    $1, kpti_enable
        jne     1f

        /* Have to pop it back off now before we change %cr3! */
        popfq
        mov     %r13, %gs:CPU_KPTI_R13
        SET_USER_CR3(%r13)
        mov     %gs:CPU_KPTI_R13, %r13
        /* Zero these to make sure they didn't leak from a kernel trap */
        movq    $0, %gs:CPU_KPTI_R13
        movq    $0, %gs:CPU_KPTI_R14
        jmp     2f
1:
        /* KPTI off: just restore RFLAGS */
        popfq
2:
        swapgs
        sti
        sysexit
        SET_SIZE(tr_sysexit)

.global tr_sysc_ret_end
tr_sysc_ret_end:
 422 
        /*
         * Syscall entry trampolines.
         *
         * No interrupt frame and no stack pivot here (see the comment
         * at the top of the file): we only spill %r13, switch to the
         * kernel %cr3, and jump on.  The DEBUG variant also records the
         * incoming %cr3 in %gs:CPU_KPTI_TR_CR3.  The final swapgs
         * restores the GSBASE we arrived with, so the real handler sees
         * the same state it would without KPTI.
         */

#if DEBUG
#define MK_SYSCALL_TRAMPOLINE(isr)              \
        ENTRY_NP(tr_/**/isr);                   \
        swapgs;                                 \
        mov     %r13, %gs:CPU_KPTI_R13;         \
        mov     %cr3, %r13;                     \
        mov     %r13, %gs:CPU_KPTI_TR_CR3;      \
        mov     %gs:CPU_KPTI_KCR3, %r13;        \
        mov     %r13, %cr3;                     \
        mov     %gs:CPU_KPTI_R13, %r13;         \
        /* restore the incoming GSBASE */       \
        swapgs;                                 \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)
#else
#define MK_SYSCALL_TRAMPOLINE(isr)              \
        ENTRY_NP(tr_/**/isr);                   \
        swapgs;                                 \
        mov     %r13, %gs:CPU_KPTI_R13;         \
        mov     %gs:CPU_KPTI_KCR3, %r13;        \
        mov     %r13, %cr3;                     \
        mov     %gs:CPU_KPTI_R13, %r13;         \
        /* restore the incoming GSBASE */       \
        swapgs;                                 \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)
#endif

        MK_SYSCALL_TRAMPOLINE(sys_syscall)
        MK_SYSCALL_TRAMPOLINE(sys_syscall32)
        MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
        MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)
 457 
        /*
         * SYSENTER is special. The CPU is really not very helpful when it
         * comes to preserving and restoring state with it, and as a result
         * we have to do all of it by hand. So, since we want to preserve
         * RFLAGS, we have to be very careful in these trampolines to not
         * clobber any bits in it. That means no cmpqs or branches!
         */
        ENTRY_NP(tr_sys_sysenter)
        swapgs
        mov     %r13, %gs:CPU_KPTI_R13
#if DEBUG
        /* Record the %cr3 we arrived with (movs only: flag-safe) */
        mov     %cr3, %r13
        mov     %r13, %gs:CPU_KPTI_TR_CR3
#endif
        /* Load the kernel %cr3 unconditionally -- no branches allowed */
        mov     %gs:CPU_KPTI_KCR3, %r13
        mov     %r13, %cr3
        mov     %gs:CPU_KPTI_R13, %r13
        jmp     _sys_sysenter_post_swapgs
        SET_SIZE(tr_sys_sysenter)
 477 
        /* Branded-zone flavor of the above; same no-branches rule. */
        ENTRY_NP(tr_brand_sys_sysenter)
        swapgs
        mov     %r13, %gs:CPU_KPTI_R13
#if DEBUG
        /* Record the %cr3 we arrived with (movs only: flag-safe) */
        mov     %cr3, %r13
        mov     %r13, %gs:CPU_KPTI_TR_CR3
#endif
        /* Load the kernel %cr3 unconditionally -- no branches allowed */
        mov     %gs:CPU_KPTI_KCR3, %r13
        mov     %r13, %cr3
        mov     %gs:CPU_KPTI_R13, %r13
        jmp     _brand_sys_sysenter_post_swapgs
        SET_SIZE(tr_brand_sys_sysenter)
 490 
/*
 * Trampoline for syscall-via-software-interrupt.  The CPU pushed an
 * iret frame onto the kpti_frame IST stack; since that frame lives in
 * the cpu_t, it is addressable as %gs:CPU_KPTI_* once we have the
 * kernel GSBASE.  Switch %cr3, pivot onto the top of the kthread
 * stack, re-push the frame there, and jump to the real ISR with the
 * incoming GSBASE restored.
 */
#define MK_SYSCALL_INT_TRAMPOLINE(isr)          \
        ENTRY_NP(tr_/**/isr);                   \
        swapgs;                                 \
        mov     %r13, %gs:CPU_KPTI_R13;         \
        SET_KERNEL_CR3(%r13);                   \
        /* Pivot to the top of the kthread stk */\
        mov     %gs:CPU_THREAD, %r13;           \
        mov     T_STACK(%r13), %r13;            \
        addq    $REGSIZE+MINFRAME, %r13;        \
        mov     %r13, %rsp;                     \
        /* Re-push the CPU's iret frame */      \
        pushq   %gs:CPU_KPTI_SS;                \
        pushq   %gs:CPU_KPTI_RSP;               \
        pushq   %gs:CPU_KPTI_RFLAGS;            \
        pushq   %gs:CPU_KPTI_CS;                \
        pushq   %gs:CPU_KPTI_RIP;               \
        mov     %gs:CPU_KPTI_R13, %r13;         \
        SWAPGS;                                 \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)

        MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
        MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
 512 
        /*
         * Interrupt/trap return trampolines
         */

/*
 * tr_intr_ret_start/_end bound the interrupt return path (used like
 * tr_sysc_ret_start/_end above, to detect an interrupt mid-return).
 */
.global tr_intr_ret_start
tr_intr_ret_start:

        /*
         * Dispatch to the right return trampoline: a plain iretq when
         * KPTI is off or the frame's %cs is the kernel selector;
         * otherwise the full user return path.
         */
        ENTRY_NP(tr_iret_auto)
        cmpq    $1, kpti_enable
        jne     tr_iret_kernel
        cmpw    $KCS_SEL, T_FRAMERET_CS(%rsp)
        je      tr_iret_kernel
        jmp     tr_iret_user
        SET_SIZE(tr_iret_auto)
 527 
        ENTRY_NP(tr_iret_kernel)
        /*
         * Return to kernel code: no %cr3 or stack work needed.
         *
         * Yes, this does nothing extra. But this way we know if we see iret
         * elsewhere, then we've failed to properly consider trampolines there.
         */
        iretq
        SET_SIZE(tr_iret_kernel)
 535 
        ENTRY_NP(tr_iret_user)
#if DEBUG
        /*
         * Ensure that we return to user land with CR0.TS clear. We do this
         * before we trampoline back and pivot the stack and %cr3. This way
         * we're still on the kernel stack and kernel %cr3, though we are on the
         * user GSBASE.
         */
        pushq   %rax
        mov     %cr0, %rax
        testq   $CR0_TS, %rax
        jz      1f
        /* TS set: back onto kernel GSBASE, then panic */
        swapgs
        popq    %rax
        leaq    _bad_ts_panic_msg(%rip), %rdi
        xorl    %eax, %eax
        pushq   %rbp
        movq    %rsp, %rbp
        call    panic
1:
        popq    %rax
#endif

        cmpq    $1, kpti_enable
        jne     1f                      /* KPTI off: plain iretq */

        /*
         * Pivot onto the per-cpu return stack in the shared page, then
         * switch to the user %cr3 before the final iretq.  The swapgs
         * pair gives us the kernel GSBASE for the %gs:CPU_KPTI_*
         * accesses and restores the user GSBASE afterwards.
         */
        swapgs
        mov     %r13, %gs:CPU_KPTI_R13
        PIVOT_KPTI_STK(%r13)
        SET_USER_CR3(%r13)
        mov     %gs:CPU_KPTI_R13, %r13
        /* Zero these to make sure they didn't leak from a kernel trap */
        movq    $0, %gs:CPU_KPTI_R13
        movq    $0, %gs:CPU_KPTI_R14
        swapgs
1:
        iretq
        SET_SIZE(tr_iret_user)
 574 
        /*
         * This special return trampoline is for KDI's use only (with kmdb).
         *
         * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
         * instead. This trampoline runs after GSBASE has already been changed
         * back to the userland value (so we can't use %gs).
         *
         * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
         * The KPTI_R13 member in the kpti_dbg has already been set to what the
         * real %r13 should be before we IRET.
         *
         * Additionally, KDI keeps a copy of the incoming %cr3 value when it
         * took an interrupt, and has put that back in the kpti_dbg area for us
         * to use, so we don't do any sniffing of %cs here. This is important
         * so that debugging code that changes %cr3 is possible.
         */
        ENTRY_NP(tr_iret_kdi)
        movq    %r14, KPTI_R14(%r13)    /* %r14 has to be preserved by us */

        movq    %rsp, %r14      /* original %rsp is pointing at IRET frame */
        /* Copy the iret frame to the top of the kpti_dbg frame */
        leaq    KPTI_TOP(%r13), %rsp
        pushq   T_FRAMERET_SS(%r14)
        pushq   T_FRAMERET_RSP(%r14)
        pushq   T_FRAMERET_RFLAGS(%r14)
        pushq   T_FRAMERET_CS(%r14)
        pushq   T_FRAMERET_RIP(%r14)

        /* Restore the %cr3 KDI stashed for us (see above) */
        movq    KPTI_TR_CR3(%r13), %r14
        movq    %r14, %cr3

        movq    KPTI_R14(%r13), %r14
        movq    KPTI_R13(%r13), %r13    /* preserved by our caller */

        iretq
        SET_SIZE(tr_iret_kdi)

.global tr_intr_ret_end
tr_intr_ret_end:
 613 
        /*
         * Interrupt/trap entry trampolines
         */

        /* CPU pushed an error code, and ISR wants one */
#define MK_INTR_TRAMPOLINE(isr)                 \
        ENTRY_NP(tr_/**/isr);                   \
        INTERRUPT_TRAMPOLINE;                   \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)

        /*
         * CPU didn't push an error code, and ISR doesn't want one: push
         * a placeholder 0 into the slot where the error code would go,
         * and don't re-push it on the destination stack.
         */
#define MK_INTR_TRAMPOLINE_NOERR(isr)           \
        ENTRY_NP(tr_/**/isr);                   \
        push    $0;                             \
        INTERRUPT_TRAMPOLINE_NOERR;             \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)

        /* As above, but using the re-entry-safe DBG_ trampoline body. */
        /* CPU pushed an error code, and ISR wants one */
#define MK_DBG_INTR_TRAMPOLINE(isr)     \
        ENTRY_NP(tr_/**/isr);                   \
        DBG_INTERRUPT_TRAMPOLINE;               \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)

        /* CPU didn't push an error code, and ISR doesn't want one */
#define MK_DBG_INTR_TRAMPOLINE_NOERR(isr)       \
        ENTRY_NP(tr_/**/isr);                   \
        push    $0;                             \
        DBG_INTERRUPT_TRAMPOLINE_NOERR;         \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)
 647 

        /*
         * The exception-vector trampolines.  The DBG_ variants are used
         * for the traps that can interrupt other trampoline code (see
         * the comment above DBG_INTERRUPT_TRAMPOLINE_P).
         */
        MK_INTR_TRAMPOLINE_NOERR(div0trap)
        MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
        MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
        MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
        MK_INTR_TRAMPOLINE_NOERR(boundstrap)
        MK_INTR_TRAMPOLINE_NOERR(invoptrap)
        MK_INTR_TRAMPOLINE_NOERR(ndptrap)
        MK_INTR_TRAMPOLINE(invtsstrap)
        MK_DBG_INTR_TRAMPOLINE(segnptrap)
        MK_DBG_INTR_TRAMPOLINE(stktrap)
        MK_DBG_INTR_TRAMPOLINE(gptrap)
        MK_DBG_INTR_TRAMPOLINE(pftrap)
        MK_INTR_TRAMPOLINE_NOERR(resvtrap)
        MK_INTR_TRAMPOLINE_NOERR(ndperr)
        MK_INTR_TRAMPOLINE(achktrap)
        MK_INTR_TRAMPOLINE_NOERR(xmtrap)
        MK_INTR_TRAMPOLINE_NOERR(invaltrap)
        MK_INTR_TRAMPOLINE_NOERR(fasttrap)
        MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)
 668 
        /*
         * These are special because they can interrupt other traps, and
         * each other. We don't need to pivot their stacks, because they have
         * dedicated IST stack space, but we need to change %cr3.
         *
         * We unconditionally load kpti_safe_cr3 (the kas PML4 -- see the
         * comment at the top of the file) rather than a per-cpu value,
         * since these can fire at absolutely any point.
         */
        ENTRY_NP(tr_nmiint)
        pushq   %r13
        mov     kpti_safe_cr3, %r13
        mov     %r13, %cr3
        popq    %r13
        jmp     nmiint
        SET_SIZE(tr_nmiint)
 681 
#if !defined(__xpv)
        ENTRY_NP(tr_syserrtrap)
        /*
         * If we got here we should always have a zero error code pushed.
         * The INT $0x8 instr doesn't seem to push one, though, which we use
         * as an emergency panic in the other trampolines. So adjust things
         * here.
         */
        cmpq    $0, (%rsp)              /* a real #DF pushes a 0 err code */
        je      1f
        pushq   $0                      /* came via INT $8: supply one */
1:
        /* Dedicated IST stack; just get onto the safe %cr3. */
        pushq   %r13
        mov     kpti_safe_cr3, %r13
        mov     %r13, %cr3
        popq    %r13
        jmp     syserrtrap
        SET_SIZE(tr_syserrtrap)
#endif
 701 
        /* #MC: dedicated IST stack, so only the safe %cr3 switch here. */
        ENTRY_NP(tr_mcetrap)
        pushq   %r13
        mov     kpti_safe_cr3, %r13
        mov     %r13, %cr3
        popq    %r13
        jmp     mcetrap
        SET_SIZE(tr_mcetrap)
 709 
        /*
         * Interrupts start at 32
         */

/*
 * Trampoline for hardware interrupt vector n: push a zero error code,
 * do the common trampoline work, then push n - 0x20 (the vector
 * relative to the interrupt base) and jump to cmnint.
 */
#define MKIVCT(n)                       \
        ENTRY_NP(tr_ivct/**/n)          \
        push    $0;                     \
        INTERRUPT_TRAMPOLINE;           \
        push    $n - 0x20;              \
        jmp     cmnint;                 \
        SET_SIZE(tr_ivct/**/n)
 720 
	/* Emit trampolines for every hardware interrupt vector, 32-255. */
	MKIVCT(32);	MKIVCT(33);	MKIVCT(34);	MKIVCT(35);
	MKIVCT(36);	MKIVCT(37);	MKIVCT(38);	MKIVCT(39);
	MKIVCT(40);	MKIVCT(41);	MKIVCT(42);	MKIVCT(43);
	MKIVCT(44);	MKIVCT(45);	MKIVCT(46);	MKIVCT(47);
	MKIVCT(48);	MKIVCT(49);	MKIVCT(50);	MKIVCT(51);
	MKIVCT(52);	MKIVCT(53);	MKIVCT(54);	MKIVCT(55);
	MKIVCT(56);	MKIVCT(57);	MKIVCT(58);	MKIVCT(59);
	MKIVCT(60);	MKIVCT(61);	MKIVCT(62);	MKIVCT(63);
	MKIVCT(64);	MKIVCT(65);	MKIVCT(66);	MKIVCT(67);
	MKIVCT(68);	MKIVCT(69);	MKIVCT(70);	MKIVCT(71);
	MKIVCT(72);	MKIVCT(73);	MKIVCT(74);	MKIVCT(75);
	MKIVCT(76);	MKIVCT(77);	MKIVCT(78);	MKIVCT(79);
	MKIVCT(80);	MKIVCT(81);	MKIVCT(82);	MKIVCT(83);
	MKIVCT(84);	MKIVCT(85);	MKIVCT(86);	MKIVCT(87);
	MKIVCT(88);	MKIVCT(89);	MKIVCT(90);	MKIVCT(91);
	MKIVCT(92);	MKIVCT(93);	MKIVCT(94);	MKIVCT(95);
	MKIVCT(96);	MKIVCT(97);	MKIVCT(98);	MKIVCT(99);
	MKIVCT(100);	MKIVCT(101);	MKIVCT(102);	MKIVCT(103);
	MKIVCT(104);	MKIVCT(105);	MKIVCT(106);	MKIVCT(107);
	MKIVCT(108);	MKIVCT(109);	MKIVCT(110);	MKIVCT(111);
	MKIVCT(112);	MKIVCT(113);	MKIVCT(114);	MKIVCT(115);
	MKIVCT(116);	MKIVCT(117);	MKIVCT(118);	MKIVCT(119);
	MKIVCT(120);	MKIVCT(121);	MKIVCT(122);	MKIVCT(123);
	MKIVCT(124);	MKIVCT(125);	MKIVCT(126);	MKIVCT(127);
	MKIVCT(128);	MKIVCT(129);	MKIVCT(130);	MKIVCT(131);
	MKIVCT(132);	MKIVCT(133);	MKIVCT(134);	MKIVCT(135);
	MKIVCT(136);	MKIVCT(137);	MKIVCT(138);	MKIVCT(139);
	MKIVCT(140);	MKIVCT(141);	MKIVCT(142);	MKIVCT(143);
	MKIVCT(144);	MKIVCT(145);	MKIVCT(146);	MKIVCT(147);
	MKIVCT(148);	MKIVCT(149);	MKIVCT(150);	MKIVCT(151);
	MKIVCT(152);	MKIVCT(153);	MKIVCT(154);	MKIVCT(155);
	MKIVCT(156);	MKIVCT(157);	MKIVCT(158);	MKIVCT(159);
	MKIVCT(160);	MKIVCT(161);	MKIVCT(162);	MKIVCT(163);
	MKIVCT(164);	MKIVCT(165);	MKIVCT(166);	MKIVCT(167);
	MKIVCT(168);	MKIVCT(169);	MKIVCT(170);	MKIVCT(171);
	MKIVCT(172);	MKIVCT(173);	MKIVCT(174);	MKIVCT(175);
	MKIVCT(176);	MKIVCT(177);	MKIVCT(178);	MKIVCT(179);
	MKIVCT(180);	MKIVCT(181);	MKIVCT(182);	MKIVCT(183);
	MKIVCT(184);	MKIVCT(185);	MKIVCT(186);	MKIVCT(187);
	MKIVCT(188);	MKIVCT(189);	MKIVCT(190);	MKIVCT(191);
	MKIVCT(192);	MKIVCT(193);	MKIVCT(194);	MKIVCT(195);
	MKIVCT(196);	MKIVCT(197);	MKIVCT(198);	MKIVCT(199);
	MKIVCT(200);	MKIVCT(201);	MKIVCT(202);	MKIVCT(203);
	MKIVCT(204);	MKIVCT(205);	MKIVCT(206);	MKIVCT(207);
	MKIVCT(208);	MKIVCT(209);	MKIVCT(210);	MKIVCT(211);
	MKIVCT(212);	MKIVCT(213);	MKIVCT(214);	MKIVCT(215);
	MKIVCT(216);	MKIVCT(217);	MKIVCT(218);	MKIVCT(219);
	MKIVCT(220);	MKIVCT(221);	MKIVCT(222);	MKIVCT(223);
	MKIVCT(224);	MKIVCT(225);	MKIVCT(226);	MKIVCT(227);
	MKIVCT(228);	MKIVCT(229);	MKIVCT(230);	MKIVCT(231);
	MKIVCT(232);	MKIVCT(233);	MKIVCT(234);	MKIVCT(235);
	MKIVCT(236);	MKIVCT(237);	MKIVCT(238);	MKIVCT(239);
	MKIVCT(240);	MKIVCT(241);	MKIVCT(242);	MKIVCT(243);
	MKIVCT(244);	MKIVCT(245);	MKIVCT(246);	MKIVCT(247);
	MKIVCT(248);	MKIVCT(249);	MKIVCT(250);	MKIVCT(251);
	MKIVCT(252);	MKIVCT(253);	MKIVCT(254);	MKIVCT(255);
 777 
 778         /*
 779          * We're PCIDE, but we don't have INVPCID.  The only way to invalidate a
 780          * PCID other than the current one, then, is to load its cr3 then
	 * invlpg.  But loading kf_user_cr3 means we can no longer access our
 782          * caller's text mapping (or indeed, its stack).  So this little helper
 783          * has to live within our trampoline text region.
 784          *
 785          * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
 786          */
	/*
	 * In:	%rdi = addr, %rsi = len, %rdx = pgsz, %rcx = user cr3
	 * Clobbers: %rax, %rdi, %rsi, flags (%rbx preserved via push/pop)
	 */
	ENTRY_NP(tr_mmu_flush_user_range)
	push	%rbx			/* %rbx is callee-saved; borrow it */
	/* When we read cr3, it never has the NOINVL bit set. */
	mov	%cr3, %rax		/* %rax = our current (kernel) cr3 */
	movq	$CR3_NOINVL_BIT, %rbx
	orq	%rbx, %rax		/* set no-invalidate bit so restoring
					   this cr3 below won't flush the TLB
					   entries for our own PCID */

	mov	%rcx, %cr3		/* switch to the caller-supplied cr3 */
	add	%rdi, %rsi		/* %rsi = addr + len (end of range) */
.align	ASM_ENTRY_ALIGN
1:
	invlpg	(%rdi)			/* invalidate one page in this PCID */
	add	%rdx, %rdi		/* addr += pgsz */
	cmp	%rsi, %rdi
	jb	1b			/* unsigned loop until addr >= end */
	mov	%rax, %cr3		/* back to our cr3, NOINVL set */
	pop	%rbx
	retq
	SET_SIZE(tr_mmu_flush_user_range)
 806 
/*
 * Page-aligned end-of-trampolines marker.  Only whole pages up to this
 * symbol are mapped into the user page table, so aligning it ensures no
 * non-trampoline kernel text shares the final mapped page.  (Presumably
 * paired with a start symbol earlier in the file — outside this view.)
 */
.align MMU_PAGESIZE
.global kpti_tramp_end
kpti_tramp_end:
	nop
 811 
 812 #endif  /* __lint */