1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 /*
  12  * Copyright 2019 Joyent, Inc.
  13  */
  14 
  15 /*
 * This file contains the trampolines that are used by KPTI in order to be
 * able to take interrupts/traps/etc. while on the "user" page table.
  18  *
  19  * We don't map the full kernel text into the user page table: instead we
  20  * map this one small section of trampolines (which compiles to ~13 pages).
 * These trampolines are always installed in the IDT (so they will run no
 * matter whether we're on the kernel or user page table), and their primary
 * job is to pivot us to the kernel %cr3 and %rsp without ruining everything.
  24  *
  25  * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
  26  * meaning that they will execute with their %rsp set to a known location, even
  27  * if we take them in the kernel.
  28  *
  29  * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
  30  * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
  31  * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
  32  * page-aligned, and we map the page it's on into both page tables. Using a
  33  * struct attached to the cpu_t also means that we can use %rsp-relative
  34  * addressing to find anything on the cpu_t, so we don't have to touch %gs or
  35  * GSBASE at all on incoming interrupt trampolines (which can get pretty hairy).
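 *
 * Because the kpti_frame page sits at a fixed offset (CPU_KPTI_START) inside
 * the cpu_t, a trampoline can recover its cpu_t from %rsp alone; in rough C
 * terms, the address arithmetic the trampoline macros below perform is:
 *
 *      cpu = (cpu_t *)(((uintptr_t)rsp & ~(MMU_PAGESIZE - 1)) -
 *          CPU_KPTI_START);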
  36  *
 * This little struct is where the CPU will push the actual interrupt frame.
 * In the trampoline, we then change %cr3, figure out our destination stack
 * pointer, and "pivot" to it (set %rsp and re-push the CPU's interrupt
 * frame). Finally, we jump to the regular ISR in the kernel text and carry
 * on as normal.
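 *
 * In outline, the common interrupt trampoline does this (a conceptual
 * sketch, not the literal code):
 *
 *      save %r13/%r14 into the kpti_frame;
 *      kf_tr_cr3 = %cr3;                       (debugging breadcrumb)
 *      if (frame %cs == KCS_SEL && frame %rsp >= kpti_kbase) {
 *              dest = frame %rsp;              (already on a kernel stack)
 *      } else {
 *              %cr3 = kernel %cr3;
 *              dest = top of curthread's kernel stack;
 *      }
 *      %rsp = dest; re-push the iret frame; restore %r13/%r14;
 *      jmp isr;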
  42  *
  43  * We leave the original frame and any spilled regs behind in the kpti_frame
  44  * lazily until we want to return to userland. Then, we clear any spilled
  45  * regs from it, and overwrite the rest with our iret frame. When switching
  46  * this cpu to a different process (in hat_switch), we bzero the whole region to
  47  * make sure nothing can leak between processes.
  48  *
 * When we later return to the place where we took the interrupt (especially
 * if it was in userland), we have to jmp back to the "return trampolines"
 * here: once we set %cr3 back to the user value, we need to be executing
 * from code in these shared pages and not the main kernel text. Even though
 * it should be fine to iret directly from kernel text when returning to
 * kernel code, we make things jmp to a trampoline here just for consistency.
  56  *
 * Note that with IST, it is critical that we always pivot away from the IST
 * stack before we can possibly take any other interrupt on the same IST
 * (unless it's an end-of-the-world fault and we don't care about ever coming
 * back from it).
  61  *
  62  * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
  63  * regularly have to happen from within trampoline code (e.g. in the sysenter
  64  * single-step case) and then return to the world normally. As a result, these
  65  * two are IST'd to their own kpti_frame right above the normal one (in the same
  66  * page), so they don't clobber their parent interrupt.
  67  *
  68  * To aid with debugging, we also IST the page fault (#PF/pftrap), general
  69  * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
  70  * their own separate kpti_frame. This ensures that if we take one of these
  71  * due to a bug in trampoline code, we preserve the original trampoline
  72  * state that caused the trap.
  73  *
 * The NMI, MCE and dblfault interrupts are also taken on their own dedicated
 * IST stacks, since they can interrupt another ISR at any time. These stacks
 * are full-sized, however, and not a little kpti_frame struct. In their
 * trampolines we only set %cr3 (and do it unconditionally), and don't bother
 * pivoting away. We're either going into the panic() path, or we're going to
 * return straight away without rescheduling, so it's fine not to be on our
 * real kthread stack (and some of the state we would want to find it with
 * might be corrupt!).
  82  *
  83  * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
  84  * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
  85  * point at the PML4 for kas early in boot and never touch it again. Hopefully
  86  * it survives whatever corruption brings down the rest of the kernel!
  87  *
 * Syscalls are different from interrupts (at least in the SYSENTER/SYSCALL64
 * cases) in that they do not push an interrupt frame (and also have some
 * other effects). In the syscall trampolines, we assume that we can only be
 * taking the call from userland, so we use swapgs and an unconditional
 * overwrite of %cr3.
  92  * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
  93  * existing %rsp pivot untouched) -- instead we spill registers into
  94  * %gs:CPU_KPTI_* as we need to.
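 *
 * The SYSCALL64 entry trampoline, for example, boils down to just this
 * (see MK_SYSCALL_TRAMPOLINE below; the DEBUG variant additionally stashes
 * the old %cr3 first):
 *
 *      swapgs
 *      mov     %r13, %gs:CPU_KPTI_R13
 *      mov     %gs:CPU_KPTI_KCR3, %r13
 *      mov     %r13, %cr3
 *      mov     %gs:CPU_KPTI_R13, %r13
 *      swapgs
 *      jmp     isr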
  95  *
  96  * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
  97  * hat_switch().
  98  */
  99 
 100 /*
 * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
 * fix bugs here, check whether they should be fixed there as well.
 103  */
 104 
 105 #include <sys/asm_linkage.h>
 106 #include <sys/asm_misc.h>
 107 #include <sys/regset.h>
 108 #include <sys/privregs.h>
 109 #include <sys/psw.h>
 110 #include <sys/machbrand.h>
 111 #include <sys/param.h>
 112 
 113 #if defined(__lint)
 114 
 115 #include <sys/types.h>
 116 #include <sys/thread.h>
 117 #include <sys/systm.h>
 118 
 119 #else   /* __lint */
 120 
 121 #include <sys/segments.h>
 122 #include <sys/pcb.h>
 123 #include <sys/trap.h>
 124 #include <sys/ftrace.h>
 125 #include <sys/traptrace.h>
 126 #include <sys/clock.h>
 127 #include <sys/model.h>
 128 #include <sys/panic.h>
 129 
 130 #if defined(__xpv)
 131 #include <sys/hypervisor.h>
 132 #endif
 133 
 134 #include "assym.h"
 135 
 136         .data
 137         DGDEF3(kpti_enable, 8, 8)
 138         .fill   1, 8, 1
 139 
 140 #if DEBUG
 141         .data
 142 _bad_ts_panic_msg:
 143         .string "kpti_trampolines.s: tr_iret_user but CR0.TS set"
 144 #endif
 145 
 146 .section ".text";
 147 .align MMU_PAGESIZE
 148 
 149 .global kpti_tramp_start
 150 kpti_tramp_start:
 151         nop
 152 
 153 /* This will be set by mlsetup, and then double-checked later */
 154 .global kpti_safe_cr3
 155 kpti_safe_cr3:
 156         .quad 0
 157         SET_SIZE(kpti_safe_cr3)
 158 
 159 /* startup_kmem() will overwrite this */
 160 .global kpti_kbase
 161 kpti_kbase:
 162         .quad KERNELBASE
 163         SET_SIZE(kpti_kbase)
 164 
 165 #define SET_KERNEL_CR3(spillreg)                \
 166         mov     %cr3, spillreg;                 \
 167         mov     spillreg, %gs:CPU_KPTI_TR_CR3;  \
 168         mov     %gs:CPU_KPTI_KCR3, spillreg;    \
 169         cmp     $0, spillreg;                   \
 170         je      2f;                             \
 171         mov     spillreg, %cr3;                 \
 172 2:
 173 
 174 #if DEBUG
 175 #define SET_USER_CR3(spillreg)                  \
 176         mov     %cr3, spillreg;                 \
 177         mov     spillreg, %gs:CPU_KPTI_TR_CR3;  \
 178         mov     %gs:CPU_KPTI_UCR3, spillreg;    \
 179         mov     spillreg, %cr3
 180 #else
 181 #define SET_USER_CR3(spillreg)                  \
 182         mov     %gs:CPU_KPTI_UCR3, spillreg;    \
 183         mov     spillreg, %cr3
 184 #endif
 185 
 186 #define PIVOT_KPTI_STK(spillreg)                \
 187         mov     %rsp, spillreg;                 \
 188         mov     %gs:CPU_KPTI_RET_RSP, %rsp;     \
 189         pushq   T_FRAMERET_SS(spillreg);        \
 190         pushq   T_FRAMERET_RSP(spillreg);       \
 191         pushq   T_FRAMERET_RFLAGS(spillreg);    \
 192         pushq   T_FRAMERET_CS(spillreg);        \
 193         pushq   T_FRAMERET_RIP(spillreg)
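
/*
 * PIVOT_KPTI_STK moves us onto this cpu's dedicated return stack and replays
 * the caller's iret frame there; roughly (a sketch):
 *
 *      old = %rsp;
 *      %rsp = %gs:CPU_KPTI_RET_RSP;
 *      push the ss/rsp/rflags/cs/rip words from the frame at 'old';
 */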
 194 
 195 
 196 #define INTERRUPT_TRAMPOLINE_P(errpush) \
 197         pushq   %r13;                           \
 198         pushq   %r14;                           \
 199         subq    $KPTI_R14, %rsp;                \
 200         /* Save current %cr3. */                \
 201         mov     %cr3, %r14;                     \
 202         mov     %r14, KPTI_TR_CR3(%rsp);        \
 203                                                 \
 204         cmpw    $KCS_SEL, KPTI_CS(%rsp);        \
 205         je      3f;                             \
 206 1:                                              \
 207         /* Change to the "kernel" %cr3 */       \
 208         mov     KPTI_KCR3(%rsp), %r14;          \
 209         cmp     $0, %r14;                       \
 210         je      2f;                             \
 211         mov     %r14, %cr3;                     \
 212 2:                                              \
 213         /* Get our cpu_t in %r13 */             \
 214         mov     %rsp, %r13;                     \
 215         and     $(~(MMU_PAGESIZE - 1)), %r13;   \
 216         subq    $CPU_KPTI_START, %r13;          \
 217         /* Use top of the kthread stk */        \
 218         mov     CPU_THREAD(%r13), %r14;         \
 219         mov     T_STACK(%r14), %r14;            \
 220         addq    $REGSIZE+MINFRAME, %r14;        \
 221         jmp     4f;                             \
 222 3:                                              \
 223         /* Check the %rsp in the frame. */      \
 224         /* Is it above kernel base? */          \
 225         mov     kpti_kbase, %r14;               \
 226         cmp     %r14, KPTI_RSP(%rsp);           \
 227         jb      1b;                             \
 228         /* Use the %rsp from the trap frame */  \
 229         mov     KPTI_RSP(%rsp), %r14;           \
 230         and     $(~0xf), %r14;                  \
 231 4:                                              \
 232         mov     %rsp, %r13;                     \
 233         /* %r14 contains our destination stk */ \
 234         mov     %r14, %rsp;                     \
 235         pushq   KPTI_SS(%r13);                  \
 236         pushq   KPTI_RSP(%r13);                 \
 237         pushq   KPTI_RFLAGS(%r13);              \
 238         pushq   KPTI_CS(%r13);                  \
 239         pushq   KPTI_RIP(%r13);                 \
 240         errpush;                                \
 241         mov     KPTI_R14(%r13), %r14;           \
 242         mov     KPTI_R13(%r13), %r13
 243 
 244 #define INTERRUPT_TRAMPOLINE_NOERR              \
 245         INTERRUPT_TRAMPOLINE_P(/**/)
 246 
 247 #define INTERRUPT_TRAMPOLINE                    \
 248         INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
 249 
 250 /*
 251  * This is used for all interrupts that can plausibly be taken inside another
 252  * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
 253  *
 254  * We also use this for #NP, even though it uses the standard IST: the
 255  * additional %rsp checks below will catch when we get an exception doing an
 256  * iret to userspace with a bad %cs/%ss.  This appears as a kernel trap, and
 257  * only later gets redirected via kern_gpfault().
 258  *
 259  * We check for whether we took the interrupt while in another trampoline, in
 260  * which case we need to use the kthread stack.
 261  */
 262 #define DBG_INTERRUPT_TRAMPOLINE_P(errpush)     \
 263         pushq   %r13;                           \
 264         pushq   %r14;                           \
 265         subq    $KPTI_R14, %rsp;                \
 266         /* Check for clobbering */              \
 267         cmp     $0, KPTI_FLAG(%rsp);            \
 268         je      1f;                             \
 269         /* Don't worry, this totally works */   \
 270         int     $8;                             \
 271 1:                                              \
 272         movq    $1, KPTI_FLAG(%rsp);            \
 273         /* Save current %cr3. */                \
 274         mov     %cr3, %r14;                     \
 275         mov     %r14, KPTI_TR_CR3(%rsp);        \
 276                                                 \
 277         cmpw    $KCS_SEL, KPTI_CS(%rsp);        \
 278         je      4f;                             \
 279 2:                                              \
 280         /* Change to the "kernel" %cr3 */       \
 281         mov     KPTI_KCR3(%rsp), %r14;          \
 282         cmp     $0, %r14;                       \
 283         je      3f;                             \
 284         mov     %r14, %cr3;                     \
 285 3:                                              \
 286         /* Get our cpu_t in %r13 */             \
 287         mov     %rsp, %r13;                     \
 288         and     $(~(MMU_PAGESIZE - 1)), %r13;   \
 289         subq    $CPU_KPTI_START, %r13;          \
 290         /* Use top of the kthread stk */        \
 291         mov     CPU_THREAD(%r13), %r14;         \
 292         mov     T_STACK(%r14), %r14;            \
 293         addq    $REGSIZE+MINFRAME, %r14;        \
 294         jmp     6f;                             \
 295 4:                                              \
 296         /* Check the %rsp in the frame. */      \
 297         /* Is it above kernel base? */          \
 298         /* If not, treat as user. */            \
 299         mov     kpti_kbase, %r14;               \
 300         cmp     %r14, KPTI_RSP(%rsp);           \
 301         jb      2b;                             \
 302         /* Is it within the kpti_frame page? */ \
 303         /* If it is, treat as user interrupt */ \
 304         mov     %rsp, %r13;                     \
 305         and     $(~(MMU_PAGESIZE - 1)), %r13;   \
 306         mov     KPTI_RSP(%rsp), %r14;           \
 307         and     $(~(MMU_PAGESIZE - 1)), %r14;   \
 308         cmp     %r13, %r14;                     \
 309         je      2b;                             \
 310         /* Were we in trampoline code? */       \
 311         leaq    kpti_tramp_start, %r14;         \
 312         cmp     %r14, KPTI_RIP(%rsp);           \
 313         jb      5f;                             \
 314         leaq    kpti_tramp_end, %r14;           \
 315         cmp     %r14, KPTI_RIP(%rsp);           \
 316         ja      5f;                             \
 317         /* If we were, change %cr3: we might */ \
 318         /* have interrupted before it did. */   \
 319         mov     KPTI_KCR3(%rsp), %r14;          \
 320         mov     %r14, %cr3;                     \
 321 5:                                              \
 322         /* Use the %rsp from the trap frame */  \
 323         mov     KPTI_RSP(%rsp), %r14;           \
 324         and     $(~0xf), %r14;                  \
 325 6:                                              \
 326         mov     %rsp, %r13;                     \
 327         /* %r14 contains our destination stk */ \
 328         mov     %r14, %rsp;                     \
 329         pushq   KPTI_SS(%r13);                  \
 330         pushq   KPTI_RSP(%r13);                 \
 331         pushq   KPTI_RFLAGS(%r13);              \
 332         pushq   KPTI_CS(%r13);                  \
 333         pushq   KPTI_RIP(%r13);                 \
 334         errpush;                                \
 335         mov     KPTI_R14(%r13), %r14;           \
 336         movq    $0, KPTI_FLAG(%r13);            \
 337         mov     KPTI_R13(%r13), %r13
 338 
 339 #define DBG_INTERRUPT_TRAMPOLINE_NOERR          \
 340         DBG_INTERRUPT_TRAMPOLINE_P(/**/)
 341 
 342 #define DBG_INTERRUPT_TRAMPOLINE                \
 343         DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
 344 
 345         /*
 346          * These labels (_start and _end) are used by trap.c to determine if
 347          * we took an interrupt like an NMI during the return process.
 348          */
 349 .global tr_sysc_ret_start
 350 tr_sysc_ret_start:
 351 
 352         /*
 353          * Syscall return trampolines.
 354          *
 355          * These are expected to be called on the kernel %gs. tr_sysret[ql] are
 356          * called after %rsp is changed back to the user value, so we have no
 357          * stack to work with. tr_sysexit has a kernel stack (but has to
 358          * preserve rflags, soooo).
 359          */
 360         ENTRY_NP(tr_sysretq)
 361         cmpq    $1, kpti_enable
 362         jne     1f
 363 
 364         mov     %r13, %gs:CPU_KPTI_R13
 365         SET_USER_CR3(%r13)
 366         mov     %gs:CPU_KPTI_R13, %r13
 367         /* Zero these to make sure they didn't leak from a kernel trap */
 368         movq    $0, %gs:CPU_KPTI_R13
 369         movq    $0, %gs:CPU_KPTI_R14
 370 1:
 371         swapgs
 372         sysretq
 373         SET_SIZE(tr_sysretq)
 374 
 375         ENTRY_NP(tr_sysretl)
 376         cmpq    $1, kpti_enable
 377         jne     1f
 378 
 379         mov     %r13, %gs:CPU_KPTI_R13
 380         SET_USER_CR3(%r13)
 381         mov     %gs:CPU_KPTI_R13, %r13
 382         /* Zero these to make sure they didn't leak from a kernel trap */
 383         movq    $0, %gs:CPU_KPTI_R13
 384         movq    $0, %gs:CPU_KPTI_R14
 385 1:
 386         SWAPGS
 387         SYSRETL
 388         SET_SIZE(tr_sysretl)
 389 
 390         ENTRY_NP(tr_sysexit)
 391         /*
 392          * Note: we want to preserve RFLAGS across this branch, since sysexit
 393          * (unlike sysret above) does not restore RFLAGS for us.
 394          *
 395          * We still have the real kernel stack (sysexit does restore that), so
 396          * we can use pushfq/popfq.
 397          */
 398         pushfq
 399 
 400         cmpq    $1, kpti_enable
 401         jne     1f
 402 
 403         /* Have to pop it back off now before we change %cr3! */
 404         popfq
 405         mov     %r13, %gs:CPU_KPTI_R13
 406         SET_USER_CR3(%r13)
 407         mov     %gs:CPU_KPTI_R13, %r13
 408         /* Zero these to make sure they didn't leak from a kernel trap */
 409         movq    $0, %gs:CPU_KPTI_R13
 410         movq    $0, %gs:CPU_KPTI_R14
 411         jmp     2f
 412 1:
 413         popfq
 414 2:
 415         swapgs
 416         sti
 417         sysexit
 418         SET_SIZE(tr_sysexit)
 419 
 420 .global tr_sysc_ret_end
 421 tr_sysc_ret_end:
 422 
 423         /*
 424          * Syscall entry trampolines.
 425          */
 426 
 427 #if DEBUG
 428 #define MK_SYSCALL_TRAMPOLINE(isr)              \
 429         ENTRY_NP(tr_/**/isr);                   \
 430         swapgs;                                 \
 431         mov     %r13, %gs:CPU_KPTI_R13;         \
 432         mov     %cr3, %r13;                     \
 433         mov     %r13, %gs:CPU_KPTI_TR_CR3;      \
 434         mov     %gs:CPU_KPTI_KCR3, %r13;        \
 435         mov     %r13, %cr3;                     \
 436         mov     %gs:CPU_KPTI_R13, %r13;         \
 437         swapgs;                                 \
 438         jmp     isr;                            \
 439         SET_SIZE(tr_/**/isr)
 440 #else
 441 #define MK_SYSCALL_TRAMPOLINE(isr)              \
 442         ENTRY_NP(tr_/**/isr);                   \
 443         swapgs;                                 \
 444         mov     %r13, %gs:CPU_KPTI_R13;         \
 445         mov     %gs:CPU_KPTI_KCR3, %r13;        \
 446         mov     %r13, %cr3;                     \
 447         mov     %gs:CPU_KPTI_R13, %r13;         \
 448         swapgs;                                 \
 449         jmp     isr;                            \
 450         SET_SIZE(tr_/**/isr)
 451 #endif
 452 
 453         MK_SYSCALL_TRAMPOLINE(sys_syscall)
 454         MK_SYSCALL_TRAMPOLINE(sys_syscall32)
 455         MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
 456         MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)
 457 
 458         /*
 459          * SYSENTER is special. The CPU is really not very helpful when it
 460          * comes to preserving and restoring state with it, and as a result
 461          * we have to do all of it by hand. So, since we want to preserve
 462          * RFLAGS, we have to be very careful in these trampolines to not
 463          * clobber any bits in it. That means no cmpqs or branches!
 464          */
 465         ENTRY_NP(tr_sys_sysenter)
 466         swapgs
 467         mov     %r13, %gs:CPU_KPTI_R13
 468 #if DEBUG
 469         mov     %cr3, %r13
 470         mov     %r13, %gs:CPU_KPTI_TR_CR3
 471 #endif
 472         mov     %gs:CPU_KPTI_KCR3, %r13
 473         mov     %r13, %cr3
 474         mov     %gs:CPU_KPTI_R13, %r13
 475         jmp     _sys_sysenter_post_swapgs
 476         SET_SIZE(tr_sys_sysenter)
 477 
 478         ENTRY_NP(tr_brand_sys_sysenter)
 479         swapgs
 480         mov     %r13, %gs:CPU_KPTI_R13
 481 #if DEBUG
 482         mov     %cr3, %r13
 483         mov     %r13, %gs:CPU_KPTI_TR_CR3
 484 #endif
 485         mov     %gs:CPU_KPTI_KCR3, %r13
 486         mov     %r13, %cr3
 487         mov     %gs:CPU_KPTI_R13, %r13
 488         jmp     _brand_sys_sysenter_post_swapgs
 489         SET_SIZE(tr_brand_sys_sysenter)
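
        /*
         * The int-style system call entry arrives through an interrupt
         * gate, so (unlike SYSCALL/SYSENTER above) the CPU has pushed a
         * frame for us -- into the kpti_frame, via IST. So here we do
         * pivot: we move onto the kthread stack and re-push that frame
         * from the %gs:CPU_KPTI_* slots before jumping to the handler.
         */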
 490 
 491 #define MK_SYSCALL_INT_TRAMPOLINE(isr)          \
 492         ENTRY_NP(tr_/**/isr);                   \
 493         swapgs;                                 \
 494         mov     %r13, %gs:CPU_KPTI_R13;         \
 495         SET_KERNEL_CR3(%r13);                   \
 496         mov     %gs:CPU_THREAD, %r13;           \
 497         mov     T_STACK(%r13), %r13;            \
 498         addq    $REGSIZE+MINFRAME, %r13;        \
 499         mov     %r13, %rsp;                     \
 500         pushq   %gs:CPU_KPTI_SS;                \
 501         pushq   %gs:CPU_KPTI_RSP;               \
 502         pushq   %gs:CPU_KPTI_RFLAGS;            \
 503         pushq   %gs:CPU_KPTI_CS;                \
 504         pushq   %gs:CPU_KPTI_RIP;               \
 505         mov     %gs:CPU_KPTI_R13, %r13;         \
 506         swapgs;                                 \
 507         jmp     isr;                            \
 508         SET_SIZE(tr_/**/isr)
 509 
 510         MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
 511         MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
 512 
 513         /*
 514          * Interrupt/trap return trampolines
 515          */
 516 
 517 .global tr_intr_ret_start
 518 tr_intr_ret_start:
 519 
 520         ENTRY_NP(tr_iret_auto)
 521         cmpq    $1, kpti_enable
 522         jne     tr_iret_kernel
 523         cmpw    $KCS_SEL, T_FRAMERET_CS(%rsp)
 524         je      tr_iret_kernel
 525         jmp     tr_iret_user
 526         SET_SIZE(tr_iret_auto)
 527 
 528         ENTRY_NP(tr_iret_kernel)
 529         /*
         * Yes, this does nothing extra. But this way we know that if we see
         * iret elsewhere, we've failed to properly consider trampolines there.
 532          */
 533         iretq
 534         SET_SIZE(tr_iret_kernel)
 535 
 536         ENTRY_NP(tr_iret_user)
 537 #if DEBUG
 538         /*
 539          * Panic if we find CR0.TS set. We're still on the kernel stack and
 540          * %cr3, but we do need to swap back to the kernel gs. (We don't worry
 541          * about swapgs speculation here.)
 542          */
 543         pushq   %rax
 544         mov     %cr0, %rax
 545         testq   $CR0_TS, %rax
 546         jz      1f
 547         swapgs
 548         popq    %rax
 549         leaq    _bad_ts_panic_msg(%rip), %rdi
 550         xorl    %eax, %eax
 551         pushq   %rbp
 552         movq    %rsp, %rbp
 553         call    panic
 554 1:
 555         popq    %rax
 556 #endif
 557 
 558         cmpq    $1, kpti_enable
 559         jne     1f
 560 
 561         /*
 562          * KPTI enabled: we're on the user gsbase at this point, so we
 563          * need to swap back so we can pivot stacks.
 564          *
 565          * The swapgs lfence mitigation is probably not needed here
 566          * since a mis-speculation of the above branch would imply KPTI
 567          * is disabled, but we'll do so anyway.
 568          */
 569         swapgs
 570         lfence
 571         mov     %r13, %gs:CPU_KPTI_R13
 572         PIVOT_KPTI_STK(%r13)
 573         SET_USER_CR3(%r13)
 574         mov     %gs:CPU_KPTI_R13, %r13
 575         /* Zero these to make sure they didn't leak from a kernel trap. */
 576         movq    $0, %gs:CPU_KPTI_R13
 577         movq    $0, %gs:CPU_KPTI_R14
 578         /* And back to user gsbase again. */
 579         swapgs
 580 1:
 581         iretq
 582         SET_SIZE(tr_iret_user)
 583 
 584         /*
 585          * This special return trampoline is for KDI's use only (with kmdb).
 586          *
 587          * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
 588          * instead. This trampoline runs after GSBASE has already been changed
 589          * back to the userland value (so we can't use %gs).
 590          *
 591          * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
 592          * The KPTI_R13 member in the kpti_dbg has already been set to what the
 593          * real %r13 should be before we IRET.
 594          *
 595          * Additionally, KDI keeps a copy of the incoming %cr3 value when it
 596          * took an interrupt, and has put that back in the kpti_dbg area for us
 597          * to use, so we don't do any sniffing of %cs here. This is important
 598          * so that debugging code that changes %cr3 is possible.
 599          */
 600         ENTRY_NP(tr_iret_kdi)
 601         movq    %r14, KPTI_R14(%r13)    /* %r14 has to be preserved by us */
 602 
 603         movq    %rsp, %r14      /* original %rsp is pointing at IRET frame */
 604         leaq    KPTI_TOP(%r13), %rsp
 605         pushq   T_FRAMERET_SS(%r14)
 606         pushq   T_FRAMERET_RSP(%r14)
 607         pushq   T_FRAMERET_RFLAGS(%r14)
 608         pushq   T_FRAMERET_CS(%r14)
 609         pushq   T_FRAMERET_RIP(%r14)
 610 
 611         movq    KPTI_TR_CR3(%r13), %r14
 612         movq    %r14, %cr3
 613 
 614         movq    KPTI_R14(%r13), %r14
 615         movq    KPTI_R13(%r13), %r13    /* preserved by our caller */
 616 
 617         iretq
 618         SET_SIZE(tr_iret_kdi)
 619 
 620 .global tr_intr_ret_end
 621 tr_intr_ret_end:
 622 
 623         /*
 624          * Interrupt/trap entry trampolines
 625          */
 626 
 627         /* CPU pushed an error code, and ISR wants one */
 628 #define MK_INTR_TRAMPOLINE(isr)                 \
 629         ENTRY_NP(tr_/**/isr);                   \
 630         INTERRUPT_TRAMPOLINE;                   \
 631         jmp     isr;                            \
 632         SET_SIZE(tr_/**/isr)
 633 
 634         /* CPU didn't push an error code, and ISR doesn't want one */
 635 #define MK_INTR_TRAMPOLINE_NOERR(isr)           \
 636         ENTRY_NP(tr_/**/isr);                   \
 637         push    $0;                             \
 638         INTERRUPT_TRAMPOLINE_NOERR;             \
 639         jmp     isr;                            \
 640         SET_SIZE(tr_/**/isr)
 641 
 642         /* CPU pushed an error code, and ISR wants one */
#define MK_DBG_INTR_TRAMPOLINE(isr)             \
 644         ENTRY_NP(tr_/**/isr);                   \
 645         DBG_INTERRUPT_TRAMPOLINE;               \
 646         jmp     isr;                            \
 647         SET_SIZE(tr_/**/isr)
 648 
 649         /* CPU didn't push an error code, and ISR doesn't want one */
 650 #define MK_DBG_INTR_TRAMPOLINE_NOERR(isr)       \
 651         ENTRY_NP(tr_/**/isr);                   \
 652         push    $0;                             \
 653         DBG_INTERRUPT_TRAMPOLINE_NOERR;         \
 654         jmp     isr;                            \
 655         SET_SIZE(tr_/**/isr)
 656 
 657 
 658         MK_INTR_TRAMPOLINE_NOERR(div0trap)
 659         MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
 660         MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
 661         MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
 662         MK_INTR_TRAMPOLINE_NOERR(boundstrap)
 663         MK_INTR_TRAMPOLINE_NOERR(invoptrap)
 664         MK_INTR_TRAMPOLINE_NOERR(ndptrap)
 665         MK_INTR_TRAMPOLINE(invtsstrap)
 666         MK_DBG_INTR_TRAMPOLINE(segnptrap)
 667         MK_DBG_INTR_TRAMPOLINE(stktrap)
 668         MK_DBG_INTR_TRAMPOLINE(gptrap)
 669         MK_DBG_INTR_TRAMPOLINE(pftrap)
 670         MK_INTR_TRAMPOLINE_NOERR(resvtrap)
 671         MK_INTR_TRAMPOLINE_NOERR(ndperr)
 672         MK_INTR_TRAMPOLINE(achktrap)
 673         MK_INTR_TRAMPOLINE_NOERR(xmtrap)
 674         MK_INTR_TRAMPOLINE_NOERR(invaltrap)
 675         MK_INTR_TRAMPOLINE_NOERR(fasttrap)
 676         MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)
 677 
 678         /*
 679          * These are special because they can interrupt other traps, and
 680          * each other. We don't need to pivot their stacks, because they have
 681          * dedicated IST stack space, but we need to change %cr3.
 682          */
 683         ENTRY_NP(tr_nmiint)
 684         pushq   %r13
 685         mov     kpti_safe_cr3, %r13
 686         mov     %r13, %cr3
 687         popq    %r13
 688         jmp     nmiint
 689         SET_SIZE(tr_nmiint)
 690 
 691 #if !defined(__xpv)
 692         ENTRY_NP(tr_syserrtrap)
 693         /*
         * If we got here we should always have a zero error code pushed.
         * The INT $0x8 instruction (which we use as an emergency panic in
         * the other trampolines) doesn't seem to push one, though. So
         * adjust things here.
 698          */
 699         cmpq    $0, (%rsp)
 700         je      1f
 701         pushq   $0
 702 1:
 703         pushq   %r13
 704         mov     kpti_safe_cr3, %r13
 705         mov     %r13, %cr3
 706         popq    %r13
 707         jmp     syserrtrap
 708         SET_SIZE(tr_syserrtrap)
 709 #endif
 710 
 711         ENTRY_NP(tr_mcetrap)
 712         pushq   %r13
 713         mov     kpti_safe_cr3, %r13
 714         mov     %r13, %cr3
 715         popq    %r13
 716         jmp     mcetrap
 717         SET_SIZE(tr_mcetrap)
 718 
 719         /*
 720          * Interrupts start at 32
 721          */
 722 #define MKIVCT(n)                       \
 723         ENTRY_NP(tr_ivct/**/n)          \
 724         push    $0;                     \
 725         INTERRUPT_TRAMPOLINE;           \
 726         push    $n - 0x20;              \
 727         jmp     cmnint;                 \
 728         SET_SIZE(tr_ivct/**/n)
 729 
 730         MKIVCT(32);     MKIVCT(33);     MKIVCT(34);     MKIVCT(35);
 731         MKIVCT(36);     MKIVCT(37);     MKIVCT(38);     MKIVCT(39);
 732         MKIVCT(40);     MKIVCT(41);     MKIVCT(42);     MKIVCT(43);
 733         MKIVCT(44);     MKIVCT(45);     MKIVCT(46);     MKIVCT(47);
 734         MKIVCT(48);     MKIVCT(49);     MKIVCT(50);     MKIVCT(51);
 735         MKIVCT(52);     MKIVCT(53);     MKIVCT(54);     MKIVCT(55);
 736         MKIVCT(56);     MKIVCT(57);     MKIVCT(58);     MKIVCT(59);
 737         MKIVCT(60);     MKIVCT(61);     MKIVCT(62);     MKIVCT(63);
 738         MKIVCT(64);     MKIVCT(65);     MKIVCT(66);     MKIVCT(67);
 739         MKIVCT(68);     MKIVCT(69);     MKIVCT(70);     MKIVCT(71);
 740         MKIVCT(72);     MKIVCT(73);     MKIVCT(74);     MKIVCT(75);
 741         MKIVCT(76);     MKIVCT(77);     MKIVCT(78);     MKIVCT(79);
 742         MKIVCT(80);     MKIVCT(81);     MKIVCT(82);     MKIVCT(83);
 743         MKIVCT(84);     MKIVCT(85);     MKIVCT(86);     MKIVCT(87);
 744         MKIVCT(88);     MKIVCT(89);     MKIVCT(90);     MKIVCT(91);
 745         MKIVCT(92);     MKIVCT(93);     MKIVCT(94);     MKIVCT(95);
 746         MKIVCT(96);     MKIVCT(97);     MKIVCT(98);     MKIVCT(99);
 747         MKIVCT(100);    MKIVCT(101);    MKIVCT(102);    MKIVCT(103);
 748         MKIVCT(104);    MKIVCT(105);    MKIVCT(106);    MKIVCT(107);
 749         MKIVCT(108);    MKIVCT(109);    MKIVCT(110);    MKIVCT(111);
 750         MKIVCT(112);    MKIVCT(113);    MKIVCT(114);    MKIVCT(115);
 751         MKIVCT(116);    MKIVCT(117);    MKIVCT(118);    MKIVCT(119);
 752         MKIVCT(120);    MKIVCT(121);    MKIVCT(122);    MKIVCT(123);
 753         MKIVCT(124);    MKIVCT(125);    MKIVCT(126);    MKIVCT(127);
 754         MKIVCT(128);    MKIVCT(129);    MKIVCT(130);    MKIVCT(131);
 755         MKIVCT(132);    MKIVCT(133);    MKIVCT(134);    MKIVCT(135);
 756         MKIVCT(136);    MKIVCT(137);    MKIVCT(138);    MKIVCT(139);
 757         MKIVCT(140);    MKIVCT(141);    MKIVCT(142);    MKIVCT(143);
 758         MKIVCT(144);    MKIVCT(145);    MKIVCT(146);    MKIVCT(147);
 759         MKIVCT(148);    MKIVCT(149);    MKIVCT(150);    MKIVCT(151);
 760         MKIVCT(152);    MKIVCT(153);    MKIVCT(154);    MKIVCT(155);
 761         MKIVCT(156);    MKIVCT(157);    MKIVCT(158);    MKIVCT(159);
 762         MKIVCT(160);    MKIVCT(161);    MKIVCT(162);    MKIVCT(163);
 763         MKIVCT(164);    MKIVCT(165);    MKIVCT(166);    MKIVCT(167);
 764         MKIVCT(168);    MKIVCT(169);    MKIVCT(170);    MKIVCT(171);
 765         MKIVCT(172);    MKIVCT(173);    MKIVCT(174);    MKIVCT(175);
 766         MKIVCT(176);    MKIVCT(177);    MKIVCT(178);    MKIVCT(179);
 767         MKIVCT(180);    MKIVCT(181);    MKIVCT(182);    MKIVCT(183);
 768         MKIVCT(184);    MKIVCT(185);    MKIVCT(186);    MKIVCT(187);
 769         MKIVCT(188);    MKIVCT(189);    MKIVCT(190);    MKIVCT(191);
 770         MKIVCT(192);    MKIVCT(193);    MKIVCT(194);    MKIVCT(195);
 771         MKIVCT(196);    MKIVCT(197);    MKIVCT(198);    MKIVCT(199);
 772         MKIVCT(200);    MKIVCT(201);    MKIVCT(202);    MKIVCT(203);
 773         MKIVCT(204);    MKIVCT(205);    MKIVCT(206);    MKIVCT(207);
 774         MKIVCT(208);    MKIVCT(209);    MKIVCT(210);    MKIVCT(211);
 775         MKIVCT(212);    MKIVCT(213);    MKIVCT(214);    MKIVCT(215);
 776         MKIVCT(216);    MKIVCT(217);    MKIVCT(218);    MKIVCT(219);
 777         MKIVCT(220);    MKIVCT(221);    MKIVCT(222);    MKIVCT(223);
 778         MKIVCT(224);    MKIVCT(225);    MKIVCT(226);    MKIVCT(227);
 779         MKIVCT(228);    MKIVCT(229);    MKIVCT(230);    MKIVCT(231);
 780         MKIVCT(232);    MKIVCT(233);    MKIVCT(234);    MKIVCT(235);
 781         MKIVCT(236);    MKIVCT(237);    MKIVCT(238);    MKIVCT(239);
 782         MKIVCT(240);    MKIVCT(241);    MKIVCT(242);    MKIVCT(243);
 783         MKIVCT(244);    MKIVCT(245);    MKIVCT(246);    MKIVCT(247);
 784         MKIVCT(248);    MKIVCT(249);    MKIVCT(250);    MKIVCT(251);
 785         MKIVCT(252);    MKIVCT(253);    MKIVCT(254);    MKIVCT(255);
 786 
 787         /*
 788          * We're PCIDE, but we don't have INVPCID.  The only way to invalidate a
 789          * PCID other than the current one, then, is to load its cr3 then
         * invlpg.  But loading kf_user_cr3 means we can no longer access our
 791          * caller's text mapping (or indeed, its stack).  So this little helper
 792          * has to live within our trampoline text region.
 793          *
 794          * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
 795          */
 796         ENTRY_NP(tr_mmu_flush_user_range)
 797         push    %rbx
 798         /* When we read cr3, it never has the NOINVL bit set. */
 799         mov     %cr3, %rax
 800         movq    $CR3_NOINVL_BIT, %rbx
 801         orq     %rbx, %rax
 802 
 803         mov     %rcx, %cr3
 804         add     %rdi, %rsi
 805 .align  ASM_ENTRY_ALIGN
 806 1:
 807         invlpg  (%rdi)
 808         add     %rdx, %rdi
 809         cmp     %rsi, %rdi
 810         jb      1b
 811         mov     %rax, %cr3
 812         pop     %rbx
 813         retq
 814         SET_SIZE(tr_mmu_flush_user_range)
 815 
 816 .align MMU_PAGESIZE
 817 .global kpti_tramp_end
 818 kpti_tramp_end:
 819         nop
 820 
 821 #endif  /* __lint */