illumos-gate New usr/src/uts/i86pc/ml/kpti

   1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 /*
  12  * Copyright 2018 Joyent, Inc.
  13  */
  14 
  15 /*
  16  * This file contains the trampolines that are used by KPTI in order to be
  17  * able to take interrupts/trap/etc while on the "user" page table.
  18  *
  19  * We don't map the full kernel text into the user page table: instead we
  20  * map this one small section of trampolines (which compiles to ~13 pages).
  21  * These trampolines are set in the IDT always (so they will run no matter
  22  * whether we're on the kernel or user page table), and their primary job is to
  23  * pivot us to the kernel %cr3 and %rsp without ruining everything.
  24  *
  25  * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
  26  * meaning that they will execute with their %rsp set to a known location, even
  27  * if we take them in the kernel.
  28  *
  29  * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
  30  * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
  31  * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
  32  * page-aligned, and we map the page it's on into both page tables. Using a
  33  * struct attached to the cpu_t also means that we can use %rsp-relative
  34  * addressing to find anything on the cpu_t, so we don't have to touch %gs or
  35  * GSBASE at all on incoming interrupt trampolines (which can get pretty hairy).
  36  *
  37  * This little struct is where the CPU will push the actual interrupt frame.
  38  * Then, in the trampoline, we change %cr3, then figure out our destination
  39  * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt
  40  * frame). Then we jump to the regular ISR in the kernel text and carry on as
  41  * normal.
  42  *
  43  * We leave the original frame and any spilled regs behind in the kpti_frame
  44  * lazily until we want to return to userland. Then, we clear any spilled
  45  * regs from it, and overwrite the rest with our iret frame. When switching
  46  * this cpu to a different process (in hat_switch), we bzero the whole region to
  47  * make sure nothing can leak between processes.
  48  *
  49  * When we're returning back to the original place we took the interrupt later
  50  * (especially if it was in userland), we have to jmp back to the "return
  51  * trampolines" here, since when we set %cr3 back to the user value, we need to
  52  * be executing from code here in these shared pages and not the main kernel
  53  * text again. Even though it should be fine to iret directly from kernel text
  54  * when returning to kernel code, we make things jmp to a trampoline here just
  55  * for consistency.
  56  *
  57  * Note that with IST, it's very important that we always must have pivoted
  58  * away from the IST stack before we could possibly take any other interrupt
  59  * on the same IST (unless it's an end-of-the-world fault and we don't care
  60  * about coming back from it ever).
  61  *
  62  * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
  63  * regularly have to happen from within trampoline code (e.g. in the sysenter
  64  * single-step case) and then return to the world normally. As a result, these
  65  * two are IST'd to their own kpti_frame right above the normal one (in the same
  66  * page), so they don't clobber their parent interrupt.
  67  *
  68  * To aid with debugging, we also IST the page fault (#PF/pftrap), general
  69  * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
  70  * their own separate kpti_frame. This ensures that if we take one of these
  71  * due to a bug in trampoline code, we preserve the original trampoline
  72  * state that caused the trap.
  73  *
  74  * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
  75  * stacks, since they can interrupt another ISR at any time. These stacks are
  76  * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
  77  * their trampolines (and do it unconditionally), and don't bother pivoting
  78  * away. We're either going into the panic() path, or we're going to return
  79  * straight away without rescheduling, so it's fine to not be on our real
  80  * kthread stack (and some of the state we want to go find it with might be
  81  * corrupt!)
  82  *
  83  * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
  84  * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
  85  * point at the PML4 for kas early in boot and never touch it again. Hopefully
  86  * it survives whatever corruption brings down the rest of the kernel!
  87  *
  88  * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
  89  * cases) in that they do not push an interrupt frame (and also have some other
  90  * effects). In the syscall trampolines, we assume that we can only be taking
  91  * the call from userland and use SWAPGS and an unconditional overwrite of %cr3.
  92  * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
  93  * existing %rsp pivot untouched) -- instead we spill registers into
  94  * %gs:CPU_KPTI_* as we need to.
  95  *
  96  * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
  97  * hat_switch().
  98  */
  99 
 100 /*
 101  * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
 102  * fix bugs here check to see if they should be fixed there as well.
 103  */
 104 
 105 #include <sys/asm_linkage.h>
 106 #include <sys/asm_misc.h>
 107 #include <sys/regset.h>
 108 #include <sys/privregs.h>
 109 #include <sys/psw.h>
 110 #include <sys/machbrand.h>
 111 #include <sys/param.h>
 112 
 113 #if defined(__lint)
 114 
 115 #include <sys/types.h>
 116 #include <sys/thread.h>
 117 #include <sys/systm.h>
 118 
 119 #else   /* __lint */
 120 
 121 #include <sys/segments.h>
 122 #include <sys/pcb.h>
 123 #include <sys/trap.h>
 124 #include <sys/ftrace.h>
 125 #include <sys/traptrace.h>
 126 #include <sys/clock.h>
 127 #include <sys/model.h>
 128 #include <sys/panic.h>
 129 
 130 #if defined(__xpv)
 131 #include <sys/hypervisor.h>
 132 #endif
 133 
 134 #include "assym.h"
 135 
 136         .data
 137         DGDEF3(kpti_enable, 8, 8)
 138         .fill   1, 8, 1
 139 
 140 .section ".text";
 141 .align MMU_PAGESIZE
 142 
 143 .global kpti_tramp_start
 144 kpti_tramp_start:
 145         nop
 146 
 147 /* This will be set by mlsetup, and then double-checked later */
 148 .global kpti_safe_cr3
 149 kpti_safe_cr3:
 150         .quad 0
 151         SET_SIZE(kpti_safe_cr3)
 152 
 153 /* startup_kmem() will overwrite this */
 154 .global kpti_kbase
 155 kpti_kbase:
 156         .quad KERNELBASE
 157         SET_SIZE(kpti_kbase)
 158 
 159 #define SET_KERNEL_CR3(spillreg)                \
 160         mov     %cr3, spillreg;                 \
 161         mov     spillreg, %gs:CPU_KPTI_TR_CR3;  \
 162         mov     %gs:CPU_KPTI_KCR3, spillreg;    \
 163         cmp     $0, spillreg;                   \
 164         je      2f;                             \
 165         mov     spillreg, %cr3;                 \
 166 2:
 167 
 168 #if DEBUG
 169 #define SET_USER_CR3(spillreg)                  \
 170         mov     %cr3, spillreg;                 \
 171         mov     spillreg, %gs:CPU_KPTI_TR_CR3;  \
 172         mov     %gs:CPU_KPTI_UCR3, spillreg;    \
 173         mov     spillreg, %cr3
 174 #else
 175 #define SET_USER_CR3(spillreg)                  \
 176         mov     %gs:CPU_KPTI_UCR3, spillreg;    \
 177         mov     spillreg, %cr3
 178 #endif
 179 
 180 #define PIVOT_KPTI_STK(spillreg)                \
 181         mov     %rsp, spillreg;                 \
 182         mov     %gs:CPU_KPTI_RET_RSP, %rsp;     \
 183         pushq   T_FRAMERET_SS(spillreg);        \
 184         pushq   T_FRAMERET_RSP(spillreg);       \
 185         pushq   T_FRAMERET_RFLAGS(spillreg);    \
 186         pushq   T_FRAMERET_CS(spillreg);        \
 187         pushq   T_FRAMERET_RIP(spillreg)
 188 
 189 
 190 #define INTERRUPT_TRAMPOLINE_P(errpush) \
 191         pushq   %r13;                           \
 192         pushq   %r14;                           \
 193         subq    $KPTI_R14, %rsp;                \
 194         /* Save current %cr3. */                \
 195         mov     %cr3, %r14;                     \
 196         mov     %r14, KPTI_TR_CR3(%rsp);        \
 197                                                 \
 198         cmpw    $KCS_SEL, KPTI_CS(%rsp);        \
 199         je      3f;                             \
 200 1:                                              \
 201         /* Change to the "kernel" %cr3 */       \
 202         mov     KPTI_KCR3(%rsp), %r14;          \
 203         cmp     $0, %r14;                       \
 204         je      2f;                             \
 205         mov     %r14, %cr3;                     \
 206 2:                                              \
 207         /* Get our cpu_t in %r13 */             \
 208         mov     %rsp, %r13;                     \
 209         and     $(~(MMU_PAGESIZE - 1)), %r13;   \
 210         subq    $CPU_KPTI_START, %r13;          \
 211         /* Use top of the kthread stk */        \
 212         mov     CPU_THREAD(%r13), %r14;         \
 213         mov     T_STACK(%r14), %r14;            \
 214         addq    $REGSIZE+MINFRAME, %r14;        \
 215         jmp     4f;                             \
 216 3:                                              \
 217         /* Check the %rsp in the frame. */      \
 218         /* Is it above kernel base? */          \
 219         mov     kpti_kbase, %r14;               \
 220         cmp     %r14, KPTI_RSP(%rsp);           \
 221         jb      1b;                             \
 222         /* Use the %rsp from the trap frame */  \
 223         mov     KPTI_RSP(%rsp), %r14;           \
 224         and     $(~0xf), %r14;                  \
 225 4:                                              \
 226         mov     %rsp, %r13;                     \
 227         /* %r14 contains our destination stk */ \
 228         mov     %r14, %rsp;                     \
 229         pushq   KPTI_SS(%r13);                  \
 230         pushq   KPTI_RSP(%r13);                 \
 231         pushq   KPTI_RFLAGS(%r13);              \
 232         pushq   KPTI_CS(%r13);                  \
 233         pushq   KPTI_RIP(%r13);                 \
 234         errpush;                                \
 235         mov     KPTI_R14(%r13), %r14;           \
 236         mov     KPTI_R13(%r13), %r13
 237 
 238 #define INTERRUPT_TRAMPOLINE_NOERR              \
 239         INTERRUPT_TRAMPOLINE_P(/**/)
 240 
 241 #define INTERRUPT_TRAMPOLINE                    \
 242         INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
 243 
 244 /*
 245  * This is used for all interrupts that can plausibly be taken inside another
 246  * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
 247  *
 248  * We check for whether we took the interrupt while in another trampoline, in
 249  * which case we need to use the kthread stack.
 250  */
 251 #define DBG_INTERRUPT_TRAMPOLINE_P(errpush)     \
 252         pushq   %r13;                           \
 253         pushq   %r14;                           \
 254         subq    $KPTI_R14, %rsp;                \
 255         /* Check for clobbering */              \
 256         cmp     $0, KPTI_FLAG(%rsp);            \
 257         je      1f;                             \
 258         /* Don't worry, this totally works */   \
 259         int     $8;                             \
 260 1:                                              \
 261         movq    $1, KPTI_FLAG(%rsp);            \
 262         /* Save current %cr3. */                \
 263         mov     %cr3, %r14;                     \
 264         mov     %r14, KPTI_TR_CR3(%rsp);        \
 265                                                 \
 266         cmpw    $KCS_SEL, KPTI_CS(%rsp);        \
 267         je      4f;                             \
 268 2:                                              \
 269         /* Change to the "kernel" %cr3 */       \
 270         mov     KPTI_KCR3(%rsp), %r14;          \
 271         cmp     $0, %r14;                       \
 272         je      3f;                             \
 273         mov     %r14, %cr3;                     \
 274 3:                                              \
 275         /* Get our cpu_t in %r13 */             \
 276         mov     %rsp, %r13;                     \
 277         and     $(~(MMU_PAGESIZE - 1)), %r13;   \
 278         subq    $CPU_KPTI_START, %r13;          \
 279         /* Use top of the kthread stk */        \
 280         mov     CPU_THREAD(%r13), %r14;         \
 281         mov     T_STACK(%r14), %r14;            \
 282         addq    $REGSIZE+MINFRAME, %r14;        \
 283         jmp     6f;                             \
 284 4:                                              \
 285         /* Check the %rsp in the frame. */      \
 286         /* Is it above kernel base? */          \
 287         /* If not, treat as user. */            \
 288         mov     kpti_kbase, %r14;               \
 289         cmp     %r14, KPTI_RSP(%rsp);           \
 290         jb      2b;                             \
 291         /* Is it within the kpti_frame page? */ \
 292         /* If it is, treat as user interrupt */ \
 293         mov     %rsp, %r13;                     \
 294         and     $(~(MMU_PAGESIZE - 1)), %r13;   \
 295         mov     KPTI_RSP(%rsp), %r14;           \
 296         and     $(~(MMU_PAGESIZE - 1)), %r14;   \
 297         cmp     %r13, %r14;                     \
 298         je      2b;                             \
 299         /* Were we in trampoline code? */       \
 300         leaq    kpti_tramp_start, %r14;         \
 301         cmp     %r14, KPTI_RIP(%rsp);           \
 302         jb      5f;                             \
 303         leaq    kpti_tramp_end, %r14;           \
 304         cmp     %r14, KPTI_RIP(%rsp);           \
 305         ja      5f;                             \
 306         /* If we were, change %cr3: we might */ \
 307         /* have interrupted before it did. */   \
 308         mov     KPTI_KCR3(%rsp), %r14;          \
 309         mov     %r14, %cr3;                     \
 310 5:                                              \
 311         /* Use the %rsp from the trap frame */  \
 312         mov     KPTI_RSP(%rsp), %r14;           \
 313         and     $(~0xf), %r14;                  \
 314 6:                                              \
 315         mov     %rsp, %r13;                     \
 316         /* %r14 contains our destination stk */ \
 317         mov     %r14, %rsp;                     \
 318         pushq   KPTI_SS(%r13);                  \
 319         pushq   KPTI_RSP(%r13);                 \
 320         pushq   KPTI_RFLAGS(%r13);              \
 321         pushq   KPTI_CS(%r13);                  \
 322         pushq   KPTI_RIP(%r13);                 \
 323         errpush;                                \
 324         mov     KPTI_R14(%r13), %r14;           \
 325         movq    $0, KPTI_FLAG(%r13);            \
 326         mov     KPTI_R13(%r13), %r13
 327 
 328 #define DBG_INTERRUPT_TRAMPOLINE_NOERR          \
 329         DBG_INTERRUPT_TRAMPOLINE_P(/**/)
 330 
 331 #define DBG_INTERRUPT_TRAMPOLINE                \
 332         DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
 333 
 334         /*
 335          * These labels (_start and _end) are used by trap.c to determine if
 336          * we took an interrupt like an NMI during the return process.
 337          */
 338 .global tr_sysc_ret_start
 339 tr_sysc_ret_start:
 340 
 341         /*
 342          * Syscall return trampolines.
 343          *
 344          * These are expected to be called on the kernel %gs. tr_sysret[ql] are
 345          * called after %rsp is changed back to the user value, so we have no
 346          * stack to work with. tr_sysexit has a kernel stack (but has to
 347          * preserve rflags, soooo).
 348          */
 349         ENTRY_NP(tr_sysretq)
 350         cmpq    $1, kpti_enable
 351         jne     1f
 352 
 353         mov     %r13, %gs:CPU_KPTI_R13
 354         SET_USER_CR3(%r13)
 355         mov     %gs:CPU_KPTI_R13, %r13
 356         /* Zero these to make sure they didn't leak from a kernel trap */
 357         movq    $0, %gs:CPU_KPTI_R13
 358         movq    $0, %gs:CPU_KPTI_R14
 359 1:
 360         swapgs
 361         sysretq
 362         SET_SIZE(tr_sysretq)
 363 
 364         ENTRY_NP(tr_sysretl)
 365         cmpq    $1, kpti_enable
 366         jne     1f
 367 
 368         mov     %r13, %gs:CPU_KPTI_R13
 369         SET_USER_CR3(%r13)
 370         mov     %gs:CPU_KPTI_R13, %r13
 371         /* Zero these to make sure they didn't leak from a kernel trap */
 372         movq    $0, %gs:CPU_KPTI_R13
 373         movq    $0, %gs:CPU_KPTI_R14
 374 1:
 375         SWAPGS
 376         SYSRETL
 377         SET_SIZE(tr_sysretl)
 378 
 379         ENTRY_NP(tr_sysexit)
 380         /*
 381          * Note: we want to preserve RFLAGS across this branch, since sysexit
 382          * (unlike sysret above) does not restore RFLAGS for us.
 383          *
 384          * We still have the real kernel stack (sysexit does restore that), so
 385          * we can use pushfq/popfq.
 386          */
 387         pushfq
 388 
 389         cmpq    $1, kpti_enable
 390         jne     1f
 391 
 392         /* Have to pop it back off now before we change %cr3! */
 393         popfq
 394         mov     %r13, %gs:CPU_KPTI_R13
 395         SET_USER_CR3(%r13)
 396         mov     %gs:CPU_KPTI_R13, %r13
 397         /* Zero these to make sure they didn't leak from a kernel trap */
 398         movq    $0, %gs:CPU_KPTI_R13
 399         movq    $0, %gs:CPU_KPTI_R14
 400         jmp     2f
 401 1:
 402         popfq
 403 2:
 404         swapgs
 405         sti
 406         sysexit
 407         SET_SIZE(tr_sysexit)
 408 
 409 .global tr_sysc_ret_end
 410 tr_sysc_ret_end:
 411 
 412         /*
 413          * Syscall entry trampolines.
 414          */
 415 
 416 #if DEBUG
 417 #define MK_SYSCALL_TRAMPOLINE(isr)              \
 418         ENTRY_NP(tr_/**/isr);                   \
 419         swapgs;                                 \
 420         mov     %r13, %gs:CPU_KPTI_R13;         \
 421         mov     %cr3, %r13;                     \
 422         mov     %r13, %gs:CPU_KPTI_TR_CR3;      \
 423         mov     %gs:CPU_KPTI_KCR3, %r13;        \
 424         mov     %r13, %cr3;                     \
 425         mov     %gs:CPU_KPTI_R13, %r13;         \
 426         swapgs;                                 \
 427         jmp     isr;                            \
 428         SET_SIZE(tr_/**/isr)
 429 #else
 430 #define MK_SYSCALL_TRAMPOLINE(isr)              \
 431         ENTRY_NP(tr_/**/isr);                   \
 432         swapgs;                                 \
 433         mov     %r13, %gs:CPU_KPTI_R13;         \
 434         mov     %gs:CPU_KPTI_KCR3, %r13;        \
 435         mov     %r13, %cr3;                     \
 436         mov     %gs:CPU_KPTI_R13, %r13;         \
 437         swapgs;                                 \
 438         jmp     isr;                            \
 439         SET_SIZE(tr_/**/isr)
 440 #endif
 441 
 442         MK_SYSCALL_TRAMPOLINE(sys_syscall)
 443         MK_SYSCALL_TRAMPOLINE(sys_syscall32)
 444         MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
 445         MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)
 446 
 447         /*
 448          * SYSENTER is special. The CPU is really not very helpful when it
 449          * comes to preserving and restoring state with it, and as a result
 450          * we have to do all of it by hand. So, since we want to preserve
 451          * RFLAGS, we have to be very careful in these trampolines to not
 452          * clobber any bits in it. That means no cmpqs or branches!
 453          */
 454         ENTRY_NP(tr_sys_sysenter)
 455         swapgs
 456         mov     %r13, %gs:CPU_KPTI_R13
 457 #if DEBUG
 458         mov     %cr3, %r13
 459         mov     %r13, %gs:CPU_KPTI_TR_CR3
 460 #endif
 461         mov     %gs:CPU_KPTI_KCR3, %r13
 462         mov     %r13, %cr3
 463         mov     %gs:CPU_KPTI_R13, %r13
 464         jmp     _sys_sysenter_post_swapgs
 465         SET_SIZE(tr_sys_sysenter)
 466 
 467         ENTRY_NP(tr_brand_sys_sysenter)
 468         swapgs
 469         mov     %r13, %gs:CPU_KPTI_R13
 470 #if DEBUG
 471         mov     %cr3, %r13
 472         mov     %r13, %gs:CPU_KPTI_TR_CR3
 473 #endif
 474         mov     %gs:CPU_KPTI_KCR3, %r13
 475         mov     %r13, %cr3
 476         mov     %gs:CPU_KPTI_R13, %r13
 477         jmp     _brand_sys_sysenter_post_swapgs
 478         SET_SIZE(tr_brand_sys_sysenter)
 479 
 480 #define MK_SYSCALL_INT_TRAMPOLINE(isr)          \
 481         ENTRY_NP(tr_/**/isr);                   \
 482         swapgs;                                 \
 483         mov     %r13, %gs:CPU_KPTI_R13;         \
 484         SET_KERNEL_CR3(%r13);                   \
 485         mov     %gs:CPU_THREAD, %r13;           \
 486         mov     T_STACK(%r13), %r13;            \
 487         addq    $REGSIZE+MINFRAME, %r13;        \
 488         mov     %r13, %rsp;                     \
 489         pushq   %gs:CPU_KPTI_SS;                \
 490         pushq   %gs:CPU_KPTI_RSP;               \
 491         pushq   %gs:CPU_KPTI_RFLAGS;            \
 492         pushq   %gs:CPU_KPTI_CS;                \
 493         pushq   %gs:CPU_KPTI_RIP;               \
 494         mov     %gs:CPU_KPTI_R13, %r13;         \
 495         SWAPGS;                                 \
 496         jmp     isr;                            \
 497         SET_SIZE(tr_/**/isr)
 498 
 499         MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
 500         MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
 501 
 502         /*
 503          * Interrupt/trap return trampolines
 504          */
 505 
 506 .global tr_intr_ret_start
 507 tr_intr_ret_start:
 508 
 509         ENTRY_NP(tr_iret_auto)
 510         cmpq    $1, kpti_enable
 511         jne     tr_iret_kernel
 512         cmpw    $KCS_SEL, T_FRAMERET_CS(%rsp)
 513         je      tr_iret_kernel
 514         jmp     tr_iret_user
 515         SET_SIZE(tr_iret_auto)
 516 
 517         ENTRY_NP(tr_iret_kernel)
 518         /*
 519          * Yes, this does nothing extra. But this way we know if we see iret
 520          * elsewhere, then we've failed to properly consider trampolines there.
 521          */
 522         iretq
 523         SET_SIZE(tr_iret_kernel)
 524 
 525         ENTRY_NP(tr_iret_user)
 526         cmpq    $1, kpti_enable
 527         jne     1f
 528 
 529         swapgs
 530         mov     %r13, %gs:CPU_KPTI_R13
 531         PIVOT_KPTI_STK(%r13)
 532         SET_USER_CR3(%r13)
 533         mov     %gs:CPU_KPTI_R13, %r13
 534         /* Zero these to make sure they didn't leak from a kernel trap */
 535         movq    $0, %gs:CPU_KPTI_R13
 536         movq    $0, %gs:CPU_KPTI_R14
 537         swapgs
 538 1:
 539         iretq
 540         SET_SIZE(tr_iret_user)
 541 
 542         /*
 543          * This special return trampoline is for KDI's use only (with kmdb).
 544          *
 545          * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
 546          * instead. This trampoline runs after GSBASE has already been changed
 547          * back to the userland value (so we can't use %gs).
 548          *
 549          * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
 550          * The KPTI_R13 member in the kpti_dbg has already been set to what the
 551          * real %r13 should be before we IRET.
 552          *
 553          * Additionally, KDI keeps a copy of the incoming %cr3 value when it
 554          * took an interrupt, and has put that back in the kpti_dbg area for us
 555          * to use, so we don't do any sniffing of %cs here. This is important
 556          * so that debugging code that changes %cr3 is possible.
 557          */
 558         ENTRY_NP(tr_iret_kdi)
 559         movq    %r14, KPTI_R14(%r13)    /* %r14 has to be preserved by us */
 560 
 561         movq    %rsp, %r14      /* original %rsp is pointing at IRET frame */
 562         leaq    KPTI_TOP(%r13), %rsp
 563         pushq   T_FRAMERET_SS(%r14)
 564         pushq   T_FRAMERET_RSP(%r14)
 565         pushq   T_FRAMERET_RFLAGS(%r14)
 566         pushq   T_FRAMERET_CS(%r14)
 567         pushq   T_FRAMERET_RIP(%r14)
 568 
 569         movq    KPTI_TR_CR3(%r13), %r14
 570         movq    %r14, %cr3
 571 
 572         movq    KPTI_R14(%r13), %r14
 573         movq    KPTI_R13(%r13), %r13    /* preserved by our caller */
 574 
 575         iretq
 576         SET_SIZE(tr_iret_kdi)
 577 
 578 .global tr_intr_ret_end
 579 tr_intr_ret_end:
 580 
 581         /*
 582          * Interrupt/trap entry trampolines
 583          */
 584 
 585         /* CPU pushed an error code, and ISR wants one */
 586 #define MK_INTR_TRAMPOLINE(isr)                 \
 587         ENTRY_NP(tr_/**/isr);                   \
 588         INTERRUPT_TRAMPOLINE;                   \
 589         jmp     isr;                            \
 590         SET_SIZE(tr_/**/isr)
 591 
 592         /* CPU didn't push an error code, and ISR doesn't want one */
 593 #define MK_INTR_TRAMPOLINE_NOERR(isr)           \
 594         ENTRY_NP(tr_/**/isr);                   \
 595         push    $0;                             \
 596         INTERRUPT_TRAMPOLINE_NOERR;             \
 597         jmp     isr;                            \
 598         SET_SIZE(tr_/**/isr)
 599 
 600         /* CPU pushed an error code, and ISR wants one */
 601 #define MK_DBG_INTR_TRAMPOLINE(isr)     \
 602         ENTRY_NP(tr_/**/isr);                   \
 603         DBG_INTERRUPT_TRAMPOLINE;               \
 604         jmp     isr;                            \
 605         SET_SIZE(tr_/**/isr)
 606 
 607         /* CPU didn't push an error code, and ISR doesn't want one */
 608 #define MK_DBG_INTR_TRAMPOLINE_NOERR(isr)       \
 609         ENTRY_NP(tr_/**/isr);                   \
 610         push    $0;                             \
 611         DBG_INTERRUPT_TRAMPOLINE_NOERR;         \
 612         jmp     isr;                            \
 613         SET_SIZE(tr_/**/isr)
 614 
 615 
 616         MK_INTR_TRAMPOLINE_NOERR(div0trap)
 617         MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
 618         MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
 619         MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
 620         MK_INTR_TRAMPOLINE_NOERR(boundstrap)
 621         MK_INTR_TRAMPOLINE_NOERR(invoptrap)
 622         MK_INTR_TRAMPOLINE_NOERR(ndptrap)
 623         MK_INTR_TRAMPOLINE(invtsstrap)
 624         MK_INTR_TRAMPOLINE(segnptrap)
 625         MK_DBG_INTR_TRAMPOLINE(stktrap)
 626         MK_DBG_INTR_TRAMPOLINE(gptrap)
 627         MK_DBG_INTR_TRAMPOLINE(pftrap)
 628         MK_INTR_TRAMPOLINE_NOERR(resvtrap)
 629         MK_INTR_TRAMPOLINE_NOERR(ndperr)
 630         MK_INTR_TRAMPOLINE(achktrap)
 631         MK_INTR_TRAMPOLINE_NOERR(xmtrap)
 632         MK_INTR_TRAMPOLINE_NOERR(invaltrap)
 633         MK_INTR_TRAMPOLINE_NOERR(fasttrap)
 634         MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)
 635 
 636         /*
 637          * These are special because they can interrupt other traps, and
 638          * each other. We don't need to pivot their stacks, because they have
 639          * dedicated IST stack space, but we need to change %cr3.
 640          */
 641         ENTRY_NP(tr_nmiint)
 642         pushq   %r13
 643         mov     kpti_safe_cr3, %r13
 644         mov     %r13, %cr3
 645         popq    %r13
 646         jmp     nmiint
 647         SET_SIZE(tr_nmiint)
 648 
 649 #if !defined(__xpv)
 650         ENTRY_NP(tr_syserrtrap)
 651         /*
 652          * If we got here we should always have a zero error code pushed.
 653          * The INT $0x8 instr doesn't seem to push one, though, which we use
 654          * as an emergency panic in the other trampolines. So adjust things
 655          * here.
 656          */
 657         cmpq    $0, (%rsp)
 658         je      1f
 659         pushq   $0
 660 1:
 661         pushq   %r13
 662         mov     kpti_safe_cr3, %r13
 663         mov     %r13, %cr3
 664         popq    %r13
 665         jmp     syserrtrap
 666         SET_SIZE(tr_syserrtrap)
 667 #endif
 668 
 669         ENTRY_NP(tr_mcetrap)
 670         pushq   %r13
 671         mov     kpti_safe_cr3, %r13
 672         mov     %r13, %cr3
 673         popq    %r13
 674         jmp     mcetrap
 675         SET_SIZE(tr_mcetrap)
 676 
 677         /*
 678          * Interrupts start at 32
 679          */
 680 #define MKIVCT(n)                       \
 681         ENTRY_NP(tr_ivct/**/n)          \
 682         push    $0;                     \
 683         INTERRUPT_TRAMPOLINE;           \
 684         push    $n - 0x20;              \
 685         jmp     cmnint;                 \
 686         SET_SIZE(tr_ivct/**/n)
 687 
 688         MKIVCT(32);     MKIVCT(33);     MKIVCT(34);     MKIVCT(35);
 689         MKIVCT(36);     MKIVCT(37);     MKIVCT(38);     MKIVCT(39);
 690         MKIVCT(40);     MKIVCT(41);     MKIVCT(42);     MKIVCT(43);
 691         MKIVCT(44);     MKIVCT(45);     MKIVCT(46);     MKIVCT(47);
 692         MKIVCT(48);     MKIVCT(49);     MKIVCT(50);     MKIVCT(51);
 693         MKIVCT(52);     MKIVCT(53);     MKIVCT(54);     MKIVCT(55);
 694         MKIVCT(56);     MKIVCT(57);     MKIVCT(58);     MKIVCT(59);
 695         MKIVCT(60);     MKIVCT(61);     MKIVCT(62);     MKIVCT(63);
 696         MKIVCT(64);     MKIVCT(65);     MKIVCT(66);     MKIVCT(67);
 697         MKIVCT(68);     MKIVCT(69);     MKIVCT(70);     MKIVCT(71);
 698         MKIVCT(72);     MKIVCT(73);     MKIVCT(74);     MKIVCT(75);
 699         MKIVCT(76);     MKIVCT(77);     MKIVCT(78);     MKIVCT(79);
 700         MKIVCT(80);     MKIVCT(81);     MKIVCT(82);     MKIVCT(83);
 701         MKIVCT(84);     MKIVCT(85);     MKIVCT(86);     MKIVCT(87);
 702         MKIVCT(88);     MKIVCT(89);     MKIVCT(90);     MKIVCT(91);
 703         MKIVCT(92);     MKIVCT(93);     MKIVCT(94);     MKIVCT(95);
 704         MKIVCT(96);     MKIVCT(97);     MKIVCT(98);     MKIVCT(99);
 705         MKIVCT(100);    MKIVCT(101);    MKIVCT(102);    MKIVCT(103);
 706         MKIVCT(104);    MKIVCT(105);    MKIVCT(106);    MKIVCT(107);
 707         MKIVCT(108);    MKIVCT(109);    MKIVCT(110);    MKIVCT(111);
 708         MKIVCT(112);    MKIVCT(113);    MKIVCT(114);    MKIVCT(115);
 709         MKIVCT(116);    MKIVCT(117);    MKIVCT(118);    MKIVCT(119);
 710         MKIVCT(120);    MKIVCT(121);    MKIVCT(122);    MKIVCT(123);
 711         MKIVCT(124);    MKIVCT(125);    MKIVCT(126);    MKIVCT(127);
 712         MKIVCT(128);    MKIVCT(129);    MKIVCT(130);    MKIVCT(131);
 713         MKIVCT(132);    MKIVCT(133);    MKIVCT(134);    MKIVCT(135);
 714         MKIVCT(136);    MKIVCT(137);    MKIVCT(138);    MKIVCT(139);
 715         MKIVCT(140);    MKIVCT(141);    MKIVCT(142);    MKIVCT(143);
 716         MKIVCT(144);    MKIVCT(145);    MKIVCT(146);    MKIVCT(147);
 717         MKIVCT(148);    MKIVCT(149);    MKIVCT(150);    MKIVCT(151);
 718         MKIVCT(152);    MKIVCT(153);    MKIVCT(154);    MKIVCT(155);
 719         MKIVCT(156);    MKIVCT(157);    MKIVCT(158);    MKIVCT(159);
 720         MKIVCT(160);    MKIVCT(161);    MKIVCT(162);    MKIVCT(163);
 721         MKIVCT(164);    MKIVCT(165);    MKIVCT(166);    MKIVCT(167);
 722         MKIVCT(168);    MKIVCT(169);    MKIVCT(170);    MKIVCT(171);
 723         MKIVCT(172);    MKIVCT(173);    MKIVCT(174);    MKIVCT(175);
 724         MKIVCT(176);    MKIVCT(177);    MKIVCT(178);    MKIVCT(179);
 725         MKIVCT(180);    MKIVCT(181);    MKIVCT(182);    MKIVCT(183);
 726         MKIVCT(184);    MKIVCT(185);    MKIVCT(186);    MKIVCT(187);
 727         MKIVCT(188);    MKIVCT(189);    MKIVCT(190);    MKIVCT(191);
 728         MKIVCT(192);    MKIVCT(193);    MKIVCT(194);    MKIVCT(195);
 729         MKIVCT(196);    MKIVCT(197);    MKIVCT(198);    MKIVCT(199);
 730         MKIVCT(200);    MKIVCT(201);    MKIVCT(202);    MKIVCT(203);
 731         MKIVCT(204);    MKIVCT(205);    MKIVCT(206);    MKIVCT(207);
 732         MKIVCT(208);    MKIVCT(209);    MKIVCT(210);    MKIVCT(211);
 733         MKIVCT(212);    MKIVCT(213);    MKIVCT(214);    MKIVCT(215);
 734         MKIVCT(216);    MKIVCT(217);    MKIVCT(218);    MKIVCT(219);
 735         MKIVCT(220);    MKIVCT(221);    MKIVCT(222);    MKIVCT(223);
 736         MKIVCT(224);    MKIVCT(225);    MKIVCT(226);    MKIVCT(227);
 737         MKIVCT(228);    MKIVCT(229);    MKIVCT(230);    MKIVCT(231);
 738         MKIVCT(232);    MKIVCT(233);    MKIVCT(234);    MKIVCT(235);
 739         MKIVCT(236);    MKIVCT(237);    MKIVCT(238);    MKIVCT(239);
 740         MKIVCT(240);    MKIVCT(241);    MKIVCT(242);    MKIVCT(243);
 741         MKIVCT(244);    MKIVCT(245);    MKIVCT(246);    MKIVCT(247);
 742         MKIVCT(248);    MKIVCT(249);    MKIVCT(250);    MKIVCT(251);
 743         MKIVCT(252);    MKIVCT(253);    MKIVCT(254);    MKIVCT(255);
 744 
 745         /*
 746          * We're PCIDE, but we don't have INVPCID.  The only way to invalidate a
 747          * PCID other than the current one, then, is to load its cr3 then
 748          * invlpg.  But loading kf_user_cr3 means we can longer access our
 749          * caller's text mapping (or indeed, its stack).  So this little helper
 750          * has to live within our trampoline text region.
 751          *
 752          * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
 753          */
 754         ENTRY_NP(tr_mmu_flush_user_range)
 755         push    %rbx
 756         /* When we read cr3, it never has the NOINVL bit set. */
 757         mov     %cr3, %rax
 758         movq    $CR3_NOINVL_BIT, %rbx
 759         orq     %rbx, %rax
 760 
 761         mov     %rcx, %cr3
 762         add     %rdi, %rsi
 763 .align  ASM_ENTRY_ALIGN
 764 1:
 765         invlpg  (%rdi)
 766         add     %rdx, %rdi
 767         cmp     %rsi, %rdi
 768         jb      1b
 769         mov     %rax, %cr3
 770         pop     %rbx
 771         retq
 772         SET_SIZE(tr_mmu_flush_user_range)
 773 
 774 .align MMU_PAGESIZE
 775 .global kpti_tramp_end
 776 kpti_tramp_end:
 777         nop
 778 
 779 #endif  /* __lint */