/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/*
 * Copyright 2018 Joyent, Inc.
 */

/*
 * This file contains the trampolines that are used by KPTI in order to be
 * able to take interrupts/traps/etc while on the "user" page table.
 *
 * We don't map the full kernel text into the user page table: instead we
 * map this one small section of trampolines (which compiles to ~13 pages).
 * These trampolines are always set in the IDT (so they will run no matter
 * whether we're on the kernel or user page table), and their primary job is
 * to pivot us to the kernel %cr3 and %rsp without ruining everything.
 *
 * All of these interrupts use the amd64 IST feature when we have KPTI
 * enabled, meaning that they will execute with their %rsp set to a known
 * location, even if we take them in the kernel.
 *
 * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
 * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the
 * mcpu_kpti (a struct kpti_frame) defined in machcpuvar.h. This struct is
 * set up to be page-aligned, and we map the page it's on into both page
 * tables. Using a struct attached to the cpu_t also means that we can use
 * %rsp-relative addressing to find anything on the cpu_t, so we don't have
 * to touch %gs or GSBASE at all on incoming interrupt trampolines (which
 * can get pretty hairy).
 *
 * This little struct is where the CPU will push the actual interrupt frame.
 * Then, in the trampoline, we change %cr3, then figure out our destination
 * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt
 * frame). Then we jump to the regular ISR in the kernel text and carry on as
 * normal.
 *
 * We leave the original frame and any spilled regs behind in the kpti_frame
 * lazily until we want to return to userland. Then, we clear any spilled
 * regs from it, and overwrite the rest with our iret frame. When switching
 * this cpu to a different process (in hat_switch), we bzero the whole region
 * to make sure nothing can leak between processes.
 *
 * When we later return to the place where we took the interrupt (especially
 * if it was in userland), we have to jmp back through the "return
 * trampolines" here: once we set %cr3 back to the user value, we must be
 * executing from code in these shared pages and not the main kernel text.
 * Even though it should be fine to iret directly from kernel text when
 * returning to kernel code, we make things jmp to a trampoline here just
 * for consistency.
 *
 * Note that with IST, it's very important that we have always pivoted away
 * from the IST stack before we can possibly take any other interrupt on the
 * same IST (unless it's an end-of-the-world fault and we don't care about
 * coming back from it ever).
 *
 * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
 * regularly have to happen from within trampoline code (e.g. in the sysenter
 * single-step case) and then return to the world normally. As a result,
 * these two are IST'd to their own kpti_frame right above the normal one (in
 * the same page), so they don't clobber their parent interrupt.
 *
 * To aid with debugging, we also IST the page fault (#PF/pftrap), general
 * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
 * their own separate kpti_frame. This ensures that if we take one of these
 * due to a bug in trampoline code, we preserve the original trampoline
 * state that caused the trap.
 *
 * NMI, MCE and dblfault interrupts are also taken on their own dedicated
 * IST stacks, since they can interrupt another ISR at any time. These
 * stacks are full-sized, however, rather than a small kpti_frame struct. In
 * their trampolines we only set %cr3 (and do it unconditionally), and don't
 * bother pivoting away. We're either going into the panic() path, or we're
 * going to return straight away without rescheduling, so it's fine to not
 * be on our real kthread stack (and some of the state we want to go find it
 * with might be corrupt!)
 *
 * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
 * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this
 * to point at the PML4 for kas early in boot and never touch it again.
 * Hopefully it survives whatever corruption brings down the rest of the
 * kernel!
 *
 * Syscalls are different from interrupts (at least in the SYSENTER/SYSCALL64
 * cases) in that they do not push an interrupt frame (and also have some
 * other effects). In the syscall trampolines, we assume that we can only be
 * taking the call from userland and use SWAPGS and an unconditional
 * overwrite of %cr3. We do not do any stack pivoting for syscalls (and we
 * leave SYSENTER's existing %rsp pivot untouched) -- instead we spill
 * registers into %gs:CPU_KPTI_* as we need to.
 *
 * Note that the normal %cr3 values do not cause invalidations with PCIDE -
 * see hat_switch().
 */
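/*
 * In outline, then, a normal interrupt entry trampoline below does:
 *
 *   1. the CPU pushes its interrupt frame into the kpti_frame (via IST);
 *   2. we spill %r13/%r14 into the kpti_frame to get scratch registers;
 *   3. we load the kernel %cr3 (unless we were already on it);
 *   4. we pick a destination stack (the kthread stack, or the interrupted
 *      kernel %rsp) and re-push the interrupt frame there;
 *   5. we restore the spilled registers and jmp to the real ISR.
 *
 * The return path (tr_iret_user below) reverses this.
 */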

/*
 * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
 * fix bugs here, check to see if they should be fixed there as well.
 */

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>
#include <sys/param.h>

#if defined(__lint)

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#else   /* __lint */

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>

#if defined(__xpv)
#include <sys/hypervisor.h>
#endif

#include "assym.h"

        .data
        DGDEF3(kpti_enable, 8, 8)
        .fill   1, 8, 1

#if DEBUG
        .data
_bad_ts_panic_msg:
        .string "kpti_trampolines.s: tr_iret_user but CR0.TS set"
#endif

.section ".text";
.align MMU_PAGESIZE

.global kpti_tramp_start
kpti_tramp_start:
        nop

/* This will be set by mlsetup, and then double-checked later */
.global kpti_safe_cr3
kpti_safe_cr3:
        .quad 0
        SET_SIZE(kpti_safe_cr3)

/* startup_kmem() will overwrite this */
.global kpti_kbase
kpti_kbase:
        .quad KERNELBASE
        SET_SIZE(kpti_kbase)

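/*
 * Change %cr3 to the kernel or user value, respectively. If the kernel %cr3
 * is still zero (i.e. the per-cpu KPTI state hasn't been filled in yet),
 * SET_KERNEL_CR3 leaves %cr3 alone. SET_KERNEL_CR3 also stashes the %cr3 we
 * arrived with in CPU_KPTI_TR_CR3 for debugging; SET_USER_CR3 only does so
 * under DEBUG.
 */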
#define SET_KERNEL_CR3(spillreg)                \
        mov     %cr3, spillreg;                 \
        mov     spillreg, %gs:CPU_KPTI_TR_CR3;  \
        mov     %gs:CPU_KPTI_KCR3, spillreg;    \
        cmp     $0, spillreg;                   \
        je      2f;                             \
        mov     spillreg, %cr3;                 \
2:

#if DEBUG
#define SET_USER_CR3(spillreg)                  \
        mov     %cr3, spillreg;                 \
        mov     spillreg, %gs:CPU_KPTI_TR_CR3;  \
        mov     %gs:CPU_KPTI_UCR3, spillreg;    \
        mov     spillreg, %cr3
#else
#define SET_USER_CR3(spillreg)                  \
        mov     %gs:CPU_KPTI_UCR3, spillreg;    \
        mov     spillreg, %cr3
#endif

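/*
 * Rebuild the iret frame on the kpti_frame's return area: copy the 5-word
 * hardware frame from the stack we're leaving onto CPU_KPTI_RET_RSP and
 * leave %rsp pointing at it. This is what lets tr_iret_user switch to the
 * user %cr3 and still iretq from memory that's mapped in both page tables.
 */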
#define PIVOT_KPTI_STK(spillreg)                \
        mov     %rsp, spillreg;                 \
        mov     %gs:CPU_KPTI_RET_RSP, %rsp;     \
        pushq   T_FRAMERET_SS(spillreg);        \
        pushq   T_FRAMERET_RSP(spillreg);       \
        pushq   T_FRAMERET_RFLAGS(spillreg);    \
        pushq   T_FRAMERET_CS(spillreg);        \
        pushq   T_FRAMERET_RIP(spillreg)


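/*
 * The generic interrupt entry trampoline. The CPU has already pushed its
 * interrupt frame into the kpti_frame via IST, so %rsp points into that
 * struct (which is also how we find the cpu_t, without touching %gs). We
 * spill %r13/%r14 to get scratch registers and save the incoming %cr3,
 * then decide where to land: if we interrupted the kernel (and the
 * interrupted %rsp looks sane), we stay on that stack; otherwise we switch
 * to the kernel %cr3 and use the top of the kthread stack. Either way we
 * re-push the interrupt frame on the destination stack, restore the
 * spilled regs, and the caller jmps to the real ISR.
 */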
#define INTERRUPT_TRAMPOLINE_P(errpush)         \
        pushq   %r13;                           \
        pushq   %r14;                           \
        subq    $KPTI_R14, %rsp;                \
        /* Save current %cr3. */                \
        mov     %cr3, %r14;                     \
        mov     %r14, KPTI_TR_CR3(%rsp);        \
                                                \
        cmpw    $KCS_SEL, KPTI_CS(%rsp);        \
        je      3f;                             \
1:                                              \
        /* Change to the "kernel" %cr3 */       \
        mov     KPTI_KCR3(%rsp), %r14;          \
        cmp     $0, %r14;                       \
        je      2f;                             \
        mov     %r14, %cr3;                     \
2:                                              \
        /* Get our cpu_t in %r13 */             \
        mov     %rsp, %r13;                     \
        and     $(~(MMU_PAGESIZE - 1)), %r13;   \
        subq    $CPU_KPTI_START, %r13;          \
        /* Use top of the kthread stk */        \
        mov     CPU_THREAD(%r13), %r14;         \
        mov     T_STACK(%r14), %r14;            \
        addq    $REGSIZE+MINFRAME, %r14;        \
        jmp     4f;                             \
3:                                              \
        /* Check the %rsp in the frame. */      \
        /* Is it above kernel base? */          \
        mov     kpti_kbase, %r14;               \
        cmp     %r14, KPTI_RSP(%rsp);           \
        jb      1b;                             \
        /* Use the %rsp from the trap frame */  \
        mov     KPTI_RSP(%rsp), %r14;           \
        and     $(~0xf), %r14;                  \
4:                                              \
        mov     %rsp, %r13;                     \
        /* %r14 contains our destination stk */ \
        mov     %r14, %rsp;                     \
        pushq   KPTI_SS(%r13);                  \
        pushq   KPTI_RSP(%r13);                 \
        pushq   KPTI_RFLAGS(%r13);              \
        pushq   KPTI_CS(%r13);                  \
        pushq   KPTI_RIP(%r13);                 \
        errpush;                                \
        mov     KPTI_R14(%r13), %r14;           \
        mov     KPTI_R13(%r13), %r13

#define INTERRUPT_TRAMPOLINE_NOERR              \
        INTERRUPT_TRAMPOLINE_P(/**/)

#define INTERRUPT_TRAMPOLINE                    \
        INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))

/*
 * This is used for all interrupts that can plausibly be taken inside another
 * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
 *
 * We check whether we took the interrupt while in another trampoline, in
 * which case we need to use the kthread stack. We also check whether this
 * kpti_frame is already in use (KPTI_FLAG is set): if it is, we've clobbered
 * live trampoline state, so we escalate with an int $8 and take the double
 * fault handler on its own full IST stack and safe %cr3.
 */
#define DBG_INTERRUPT_TRAMPOLINE_P(errpush)     \
        pushq   %r13;                           \
        pushq   %r14;                           \
        subq    $KPTI_R14, %rsp;                \
        /* Check for clobbering */              \
        cmp     $0, KPTI_FLAG(%rsp);            \
        je      1f;                             \
        /* Don't worry, this totally works */   \
        int     $8;                             \
1:                                              \
        movq    $1, KPTI_FLAG(%rsp);            \
        /* Save current %cr3. */                \
        mov     %cr3, %r14;                     \
        mov     %r14, KPTI_TR_CR3(%rsp);        \
                                                \
        cmpw    $KCS_SEL, KPTI_CS(%rsp);        \
        je      4f;                             \
2:                                              \
        /* Change to the "kernel" %cr3 */       \
        mov     KPTI_KCR3(%rsp), %r14;          \
        cmp     $0, %r14;                       \
        je      3f;                             \
        mov     %r14, %cr3;                     \
3:                                              \
        /* Get our cpu_t in %r13 */             \
        mov     %rsp, %r13;                     \
        and     $(~(MMU_PAGESIZE - 1)), %r13;   \
        subq    $CPU_KPTI_START, %r13;          \
        /* Use top of the kthread stk */        \
        mov     CPU_THREAD(%r13), %r14;         \
        mov     T_STACK(%r14), %r14;            \
        addq    $REGSIZE+MINFRAME, %r14;        \
        jmp     6f;                             \
4:                                              \
        /* Check the %rsp in the frame. */      \
        /* Is it above kernel base? */          \
        /* If not, treat as user. */            \
        mov     kpti_kbase, %r14;               \
        cmp     %r14, KPTI_RSP(%rsp);           \
        jb      2b;                             \
        /* Is it within the kpti_frame page? */ \
        /* If it is, treat as user interrupt */ \
        mov     %rsp, %r13;                     \
        and     $(~(MMU_PAGESIZE - 1)), %r13;   \
        mov     KPTI_RSP(%rsp), %r14;           \
        and     $(~(MMU_PAGESIZE - 1)), %r14;   \
        cmp     %r13, %r14;                     \
        je      2b;                             \
        /* Were we in trampoline code? */       \
        leaq    kpti_tramp_start, %r14;         \
        cmp     %r14, KPTI_RIP(%rsp);           \
        jb      5f;                             \
        leaq    kpti_tramp_end, %r14;           \
        cmp     %r14, KPTI_RIP(%rsp);           \
        ja      5f;                             \
        /* If we were, change %cr3: we might */ \
        /* have interrupted before it did. */   \
        mov     KPTI_KCR3(%rsp), %r14;          \
        mov     %r14, %cr3;                     \
5:                                              \
        /* Use the %rsp from the trap frame */  \
        mov     KPTI_RSP(%rsp), %r14;           \
        and     $(~0xf), %r14;                  \
6:                                              \
        mov     %rsp, %r13;                     \
        /* %r14 contains our destination stk */ \
        mov     %r14, %rsp;                     \
        pushq   KPTI_SS(%r13);                  \
        pushq   KPTI_RSP(%r13);                 \
        pushq   KPTI_RFLAGS(%r13);              \
        pushq   KPTI_CS(%r13);                  \
        pushq   KPTI_RIP(%r13);                 \
        errpush;                                \
        mov     KPTI_R14(%r13), %r14;           \
        movq    $0, KPTI_FLAG(%r13);            \
        mov     KPTI_R13(%r13), %r13

#define DBG_INTERRUPT_TRAMPOLINE_NOERR          \
        DBG_INTERRUPT_TRAMPOLINE_P(/**/)

#define DBG_INTERRUPT_TRAMPOLINE                \
        DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))

        /*
         * These labels (_start and _end) are used by trap.c to determine if
         * we took an interrupt like an NMI during the return process.
         */
.global tr_sysc_ret_start
tr_sysc_ret_start:

        /*
         * Syscall return trampolines.
         *
         * These are expected to be called on the kernel %gs. tr_sysret[ql]
         * are called after %rsp is changed back to the user value, so we
         * have no stack to work with. tr_sysexit has a kernel stack (but
         * has to preserve RFLAGS, which takes some care -- see below).
         */
        ENTRY_NP(tr_sysretq)
        cmpq    $1, kpti_enable
        jne     1f

        mov     %r13, %gs:CPU_KPTI_R13
        SET_USER_CR3(%r13)
        mov     %gs:CPU_KPTI_R13, %r13
        /* Zero these to make sure they didn't leak from a kernel trap */
        movq    $0, %gs:CPU_KPTI_R13
        movq    $0, %gs:CPU_KPTI_R14
1:
        swapgs
        sysretq
        SET_SIZE(tr_sysretq)

        ENTRY_NP(tr_sysretl)
        cmpq    $1, kpti_enable
        jne     1f

        mov     %r13, %gs:CPU_KPTI_R13
        SET_USER_CR3(%r13)
        mov     %gs:CPU_KPTI_R13, %r13
        /* Zero these to make sure they didn't leak from a kernel trap */
        movq    $0, %gs:CPU_KPTI_R13
        movq    $0, %gs:CPU_KPTI_R14
1:
        SWAPGS
        SYSRETL
        SET_SIZE(tr_sysretl)

        ENTRY_NP(tr_sysexit)
        /*
         * Note: we want to preserve RFLAGS across this branch, since sysexit
         * (unlike sysret above) does not restore RFLAGS for us.
         *
         * We still have the real kernel stack (sysexit does restore that),
         * so we can use pushfq/popfq.
         */
        pushfq

        cmpq    $1, kpti_enable
        jne     1f

        /* Have to pop it back off now before we change %cr3! */
        popfq
        mov     %r13, %gs:CPU_KPTI_R13
        SET_USER_CR3(%r13)
        mov     %gs:CPU_KPTI_R13, %r13
        /* Zero these to make sure they didn't leak from a kernel trap */
        movq    $0, %gs:CPU_KPTI_R13
        movq    $0, %gs:CPU_KPTI_R14
        jmp     2f
1:
        popfq
2:
        swapgs
        sti
        sysexit
        SET_SIZE(tr_sysexit)

.global tr_sysc_ret_end
tr_sysc_ret_end:

        /*
         * Syscall entry trampolines.
         */

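        /*
         * For SYSCALL, the only KPTI work to do is the %cr3 switch: there's
         * no frame to move and no stack to pivot. We swapgs on entry so we
         * can spill %r13 into %gs:CPU_KPTI_R13, then swapgs back before
         * jumping to the real handler, which does its own swapgs as usual.
         * The DEBUG variant additionally records the incoming %cr3 in
         * CPU_KPTI_TR_CR3.
         */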
#if DEBUG
#define MK_SYSCALL_TRAMPOLINE(isr)              \
        ENTRY_NP(tr_/**/isr);                   \
        swapgs;                                 \
        mov     %r13, %gs:CPU_KPTI_R13;         \
        mov     %cr3, %r13;                     \
        mov     %r13, %gs:CPU_KPTI_TR_CR3;      \
        mov     %gs:CPU_KPTI_KCR3, %r13;        \
        mov     %r13, %cr3;                     \
        mov     %gs:CPU_KPTI_R13, %r13;         \
        swapgs;                                 \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)
#else
#define MK_SYSCALL_TRAMPOLINE(isr)              \
        ENTRY_NP(tr_/**/isr);                   \
        swapgs;                                 \
        mov     %r13, %gs:CPU_KPTI_R13;         \
        mov     %gs:CPU_KPTI_KCR3, %r13;        \
        mov     %r13, %cr3;                     \
        mov     %gs:CPU_KPTI_R13, %r13;         \
        swapgs;                                 \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)
#endif

        MK_SYSCALL_TRAMPOLINE(sys_syscall)
        MK_SYSCALL_TRAMPOLINE(sys_syscall32)
        MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
        MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)

        /*
         * SYSENTER is special. The CPU is really not very helpful when it
         * comes to preserving and restoring state with it, and as a result
         * we have to do all of it by hand. So, since we want to preserve
         * RFLAGS, we have to be very careful in these trampolines to not
         * clobber any bits in it. That means no cmpqs or branches!
         */
        ENTRY_NP(tr_sys_sysenter)
        swapgs
        mov     %r13, %gs:CPU_KPTI_R13
#if DEBUG
        mov     %cr3, %r13
        mov     %r13, %gs:CPU_KPTI_TR_CR3
#endif
        mov     %gs:CPU_KPTI_KCR3, %r13
        mov     %r13, %cr3
        mov     %gs:CPU_KPTI_R13, %r13
        jmp     _sys_sysenter_post_swapgs
        SET_SIZE(tr_sys_sysenter)

        ENTRY_NP(tr_brand_sys_sysenter)
        swapgs
        mov     %r13, %gs:CPU_KPTI_R13
#if DEBUG
        mov     %cr3, %r13
        mov     %r13, %gs:CPU_KPTI_TR_CR3
#endif
        mov     %gs:CPU_KPTI_KCR3, %r13
        mov     %r13, %cr3
        mov     %gs:CPU_KPTI_R13, %r13
        jmp     _brand_sys_sysenter_post_swapgs
        SET_SIZE(tr_brand_sys_sysenter)

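        /*
         * The software-interrupt syscall path goes through the IDT, so the
         * CPU pushes a real interrupt frame for it (into the kpti_frame,
         * via IST). Here we switch %cr3, pivot to the top of the kthread
         * stack, and re-push that frame before jumping to the real handler.
         */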
#define MK_SYSCALL_INT_TRAMPOLINE(isr)          \
        ENTRY_NP(tr_/**/isr);                   \
        swapgs;                                 \
        mov     %r13, %gs:CPU_KPTI_R13;         \
        SET_KERNEL_CR3(%r13);                   \
        mov     %gs:CPU_THREAD, %r13;           \
        mov     T_STACK(%r13), %r13;            \
        addq    $REGSIZE+MINFRAME, %r13;        \
        mov     %r13, %rsp;                     \
        pushq   %gs:CPU_KPTI_SS;                \
        pushq   %gs:CPU_KPTI_RSP;               \
        pushq   %gs:CPU_KPTI_RFLAGS;            \
        pushq   %gs:CPU_KPTI_CS;                \
        pushq   %gs:CPU_KPTI_RIP;               \
        mov     %gs:CPU_KPTI_R13, %r13;         \
        SWAPGS;                                 \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)

        MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
        MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)

        /*
         * Interrupt/trap return trampolines
         */

.global tr_intr_ret_start
tr_intr_ret_start:

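        /*
         * Use tr_iret_auto when the return path doesn't know statically
         * whether it's headed back to userland or to the kernel: it checks
         * the saved %cs and dispatches to the right trampoline.
         */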
        ENTRY_NP(tr_iret_auto)
        cmpq    $1, kpti_enable
        jne     tr_iret_kernel
        cmpw    $KCS_SEL, T_FRAMERET_CS(%rsp)
        je      tr_iret_kernel
        jmp     tr_iret_user
        SET_SIZE(tr_iret_auto)

        ENTRY_NP(tr_iret_kernel)
        /*
         * Yes, this does nothing extra. But this way, if we see an iret
         * elsewhere, we know we've failed to properly consider trampolines
         * there.
         */
        iretq
        SET_SIZE(tr_iret_kernel)

        ENTRY_NP(tr_iret_user)
#if DEBUG
        /*
         * Ensure that we return to userland with CR0.TS clear. We do this
         * before we trampoline back and pivot the stack and %cr3. This way
         * we're still on the kernel stack and kernel %cr3, though we are on
         * the user GSBASE.
         */
        pushq   %rax
        mov     %cr0, %rax
        testq   $CR0_TS, %rax
        jz      1f
        swapgs
        popq    %rax
        leaq    _bad_ts_panic_msg(%rip), %rdi
        xorl    %eax, %eax
        pushq   %rbp
        movq    %rsp, %rbp
        call    panic
1:
        popq    %rax
#endif

        cmpq    $1, kpti_enable
        jne     1f

        swapgs
        mov     %r13, %gs:CPU_KPTI_R13
        PIVOT_KPTI_STK(%r13)
        SET_USER_CR3(%r13)
        mov     %gs:CPU_KPTI_R13, %r13
        /* Zero these to make sure they didn't leak from a kernel trap */
        movq    $0, %gs:CPU_KPTI_R13
        movq    $0, %gs:CPU_KPTI_R14
        swapgs
1:
        iretq
        SET_SIZE(tr_iret_user)

        /*
         * This special return trampoline is for KDI's use only (with kmdb).
         *
         * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
         * instead. This trampoline runs after GSBASE has already been
         * changed back to the userland value (so we can't use %gs).
         *
         * Instead, the caller gives us a pointer to the kpti_dbg frame in
         * %r13. The KPTI_R13 member in the kpti_dbg has already been set to
         * what the real %r13 should be before we IRET.
         *
         * Additionally, KDI keeps a copy of the incoming %cr3 value when it
         * took an interrupt, and has put that back in the kpti_dbg area for
         * us to use, so we don't do any sniffing of %cs here. This is
         * important so that debugging code that changes %cr3 is possible.
         */
        ENTRY_NP(tr_iret_kdi)
        movq    %r14, KPTI_R14(%r13)    /* %r14 has to be preserved by us */

        movq    %rsp, %r14      /* original %rsp is pointing at IRET frame */
        leaq    KPTI_TOP(%r13), %rsp
        pushq   T_FRAMERET_SS(%r14)
        pushq   T_FRAMERET_RSP(%r14)
        pushq   T_FRAMERET_RFLAGS(%r14)
        pushq   T_FRAMERET_CS(%r14)
        pushq   T_FRAMERET_RIP(%r14)

        movq    KPTI_TR_CR3(%r13), %r14
        movq    %r14, %cr3

        movq    KPTI_R14(%r13), %r14
        movq    KPTI_R13(%r13), %r13    /* preserved by our caller */

        iretq
        SET_SIZE(tr_iret_kdi)

.global tr_intr_ret_end
tr_intr_ret_end:

        /*
         * Interrupt/trap entry trampolines
         */

        /* CPU pushed an error code, and ISR wants one */
#define MK_INTR_TRAMPOLINE(isr)                 \
        ENTRY_NP(tr_/**/isr);                   \
        INTERRUPT_TRAMPOLINE;                   \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)

        /* CPU didn't push an error code, and ISR doesn't want one */
#define MK_INTR_TRAMPOLINE_NOERR(isr)           \
        ENTRY_NP(tr_/**/isr);                   \
        push    $0;                             \
        INTERRUPT_TRAMPOLINE_NOERR;             \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)

        /* CPU pushed an error code, and ISR wants one */
#define MK_DBG_INTR_TRAMPOLINE(isr)             \
        ENTRY_NP(tr_/**/isr);                   \
        DBG_INTERRUPT_TRAMPOLINE;               \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)

        /* CPU didn't push an error code, and ISR doesn't want one */
#define MK_DBG_INTR_TRAMPOLINE_NOERR(isr)       \
        ENTRY_NP(tr_/**/isr);                   \
        push    $0;                             \
        DBG_INTERRUPT_TRAMPOLINE_NOERR;         \
        jmp     isr;                            \
        SET_SIZE(tr_/**/isr)


        MK_INTR_TRAMPOLINE_NOERR(div0trap)
        MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
        MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
        MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
        MK_INTR_TRAMPOLINE_NOERR(boundstrap)
        MK_INTR_TRAMPOLINE_NOERR(invoptrap)
        MK_INTR_TRAMPOLINE_NOERR(ndptrap)
        MK_INTR_TRAMPOLINE(invtsstrap)
        MK_INTR_TRAMPOLINE(segnptrap)
        MK_DBG_INTR_TRAMPOLINE(stktrap)
        MK_DBG_INTR_TRAMPOLINE(gptrap)
        MK_DBG_INTR_TRAMPOLINE(pftrap)
        MK_INTR_TRAMPOLINE_NOERR(resvtrap)
        MK_INTR_TRAMPOLINE_NOERR(ndperr)
        MK_INTR_TRAMPOLINE(achktrap)
        MK_INTR_TRAMPOLINE_NOERR(xmtrap)
        MK_INTR_TRAMPOLINE_NOERR(invaltrap)
        MK_INTR_TRAMPOLINE_NOERR(fasttrap)
        MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)

        /*
         * These are special because they can interrupt other traps, and
         * each other. We don't need to pivot their stacks, because they have
         * dedicated IST stack space, but we need to change %cr3.
         */
        ENTRY_NP(tr_nmiint)
        pushq   %r13
        mov     kpti_safe_cr3, %r13
        mov     %r13, %cr3
        popq    %r13
        jmp     nmiint
        SET_SIZE(tr_nmiint)

#if !defined(__xpv)
        ENTRY_NP(tr_syserrtrap)
        /*
         * If we got here we should always have a zero error code pushed.
         * The INT $0x8 instruction doesn't push one, though -- and that's
         * the instruction we use as an emergency panic in the other
         * trampolines. So adjust things here.
         */
        cmpq    $0, (%rsp)
        je      1f
        pushq   $0
1:
        pushq   %r13
        mov     kpti_safe_cr3, %r13
        mov     %r13, %cr3
        popq    %r13
        jmp     syserrtrap
        SET_SIZE(tr_syserrtrap)
#endif

        ENTRY_NP(tr_mcetrap)
        pushq   %r13
        mov     kpti_safe_cr3, %r13
        mov     %r13, %cr3
        popq    %r13
        jmp     mcetrap
        SET_SIZE(tr_mcetrap)

        /*
         * Hardware interrupt vectors start at 32. What we push here is the
         * vector number with that base already subtracted ($n - 0x20),
         * which is what cmnint expects.
         */
#define MKIVCT(n)                       \
        ENTRY_NP(tr_ivct/**/n)          \
        push    $0;                     \
        INTERRUPT_TRAMPOLINE;           \
        push    $n - 0x20;              \
        jmp     cmnint;                 \
        SET_SIZE(tr_ivct/**/n)

        MKIVCT(32);     MKIVCT(33);     MKIVCT(34);     MKIVCT(35);
        MKIVCT(36);     MKIVCT(37);     MKIVCT(38);     MKIVCT(39);
        MKIVCT(40);     MKIVCT(41);     MKIVCT(42);     MKIVCT(43);
        MKIVCT(44);     MKIVCT(45);     MKIVCT(46);     MKIVCT(47);
        MKIVCT(48);     MKIVCT(49);     MKIVCT(50);     MKIVCT(51);
        MKIVCT(52);     MKIVCT(53);     MKIVCT(54);     MKIVCT(55);
        MKIVCT(56);     MKIVCT(57);     MKIVCT(58);     MKIVCT(59);
        MKIVCT(60);     MKIVCT(61);     MKIVCT(62);     MKIVCT(63);
        MKIVCT(64);     MKIVCT(65);     MKIVCT(66);     MKIVCT(67);
        MKIVCT(68);     MKIVCT(69);     MKIVCT(70);     MKIVCT(71);
        MKIVCT(72);     MKIVCT(73);     MKIVCT(74);     MKIVCT(75);
        MKIVCT(76);     MKIVCT(77);     MKIVCT(78);     MKIVCT(79);
        MKIVCT(80);     MKIVCT(81);     MKIVCT(82);     MKIVCT(83);
        MKIVCT(84);     MKIVCT(85);     MKIVCT(86);     MKIVCT(87);
        MKIVCT(88);     MKIVCT(89);     MKIVCT(90);     MKIVCT(91);
        MKIVCT(92);     MKIVCT(93);     MKIVCT(94);     MKIVCT(95);
        MKIVCT(96);     MKIVCT(97);     MKIVCT(98);     MKIVCT(99);
        MKIVCT(100);    MKIVCT(101);    MKIVCT(102);    MKIVCT(103);
        MKIVCT(104);    MKIVCT(105);    MKIVCT(106);    MKIVCT(107);
        MKIVCT(108);    MKIVCT(109);    MKIVCT(110);    MKIVCT(111);
        MKIVCT(112);    MKIVCT(113);    MKIVCT(114);    MKIVCT(115);
        MKIVCT(116);    MKIVCT(117);    MKIVCT(118);    MKIVCT(119);
        MKIVCT(120);    MKIVCT(121);    MKIVCT(122);    MKIVCT(123);
        MKIVCT(124);    MKIVCT(125);    MKIVCT(126);    MKIVCT(127);
        MKIVCT(128);    MKIVCT(129);    MKIVCT(130);    MKIVCT(131);
        MKIVCT(132);    MKIVCT(133);    MKIVCT(134);    MKIVCT(135);
        MKIVCT(136);    MKIVCT(137);    MKIVCT(138);    MKIVCT(139);
        MKIVCT(140);    MKIVCT(141);    MKIVCT(142);    MKIVCT(143);
        MKIVCT(144);    MKIVCT(145);    MKIVCT(146);    MKIVCT(147);
        MKIVCT(148);    MKIVCT(149);    MKIVCT(150);    MKIVCT(151);
        MKIVCT(152);    MKIVCT(153);    MKIVCT(154);    MKIVCT(155);
        MKIVCT(156);    MKIVCT(157);    MKIVCT(158);    MKIVCT(159);
        MKIVCT(160);    MKIVCT(161);    MKIVCT(162);    MKIVCT(163);
        MKIVCT(164);    MKIVCT(165);    MKIVCT(166);    MKIVCT(167);
        MKIVCT(168);    MKIVCT(169);    MKIVCT(170);    MKIVCT(171);
        MKIVCT(172);    MKIVCT(173);    MKIVCT(174);    MKIVCT(175);
        MKIVCT(176);    MKIVCT(177);    MKIVCT(178);    MKIVCT(179);
        MKIVCT(180);    MKIVCT(181);    MKIVCT(182);    MKIVCT(183);
        MKIVCT(184);    MKIVCT(185);    MKIVCT(186);    MKIVCT(187);
        MKIVCT(188);    MKIVCT(189);    MKIVCT(190);    MKIVCT(191);
        MKIVCT(192);    MKIVCT(193);    MKIVCT(194);    MKIVCT(195);
        MKIVCT(196);    MKIVCT(197);    MKIVCT(198);    MKIVCT(199);
        MKIVCT(200);    MKIVCT(201);    MKIVCT(202);    MKIVCT(203);
        MKIVCT(204);    MKIVCT(205);    MKIVCT(206);    MKIVCT(207);
        MKIVCT(208);    MKIVCT(209);    MKIVCT(210);    MKIVCT(211);
        MKIVCT(212);    MKIVCT(213);    MKIVCT(214);    MKIVCT(215);
        MKIVCT(216);    MKIVCT(217);    MKIVCT(218);    MKIVCT(219);
        MKIVCT(220);    MKIVCT(221);    MKIVCT(222);    MKIVCT(223);
        MKIVCT(224);    MKIVCT(225);    MKIVCT(226);    MKIVCT(227);
        MKIVCT(228);    MKIVCT(229);    MKIVCT(230);    MKIVCT(231);
        MKIVCT(232);    MKIVCT(233);    MKIVCT(234);    MKIVCT(235);
        MKIVCT(236);    MKIVCT(237);    MKIVCT(238);    MKIVCT(239);
        MKIVCT(240);    MKIVCT(241);    MKIVCT(242);    MKIVCT(243);
        MKIVCT(244);    MKIVCT(245);    MKIVCT(246);    MKIVCT(247);
        MKIVCT(248);    MKIVCT(249);    MKIVCT(250);    MKIVCT(251);
        MKIVCT(252);    MKIVCT(253);    MKIVCT(254);    MKIVCT(255);

        /*
         * We're PCIDE, but we don't have INVPCID.  The only way to
         * invalidate a PCID other than the current one, then, is to load
         * its cr3 then invlpg.  But loading kf_user_cr3 means we can no
         * longer access our caller's text mapping (or indeed, its stack).
         * So this little helper has to live within our trampoline text
         * region.
         *
         * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
         */
        ENTRY_NP(tr_mmu_flush_user_range)
        push    %rbx
        /* When we read cr3, it never has the NOINVL bit set. */
        mov     %cr3, %rax
        movq    $CR3_NOINVL_BIT, %rbx
        orq     %rbx, %rax
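        /*
         * %rax now holds our own %cr3 with NOINVL set, so the reload below
         * won't flush the TLB entries for our own PCID.
         */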

        mov     %rcx, %cr3
        add     %rdi, %rsi
.align  ASM_ENTRY_ALIGN
1:
        invlpg  (%rdi)
        add     %rdx, %rdi
        cmp     %rsi, %rdi
        jb      1b
        mov     %rax, %cr3
        pop     %rbx
        retq
        SET_SIZE(tr_mmu_flush_user_range)

.align MMU_PAGESIZE
.global kpti_tramp_end
kpti_tramp_end:
        nop

#endif  /* __lint */