/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * This file contains the trampolines that are used by KPTI in order to be
 * able to take interrupts/traps/etc. while on the "user" page table.
 *
 * We don't map the full kernel text into the user page table: instead we
 * map this one small section of trampolines (which compiles to ~13 pages).
 * These trampolines are set in the IDT always (so they will run no matter
 * whether we're on the kernel or user page table), and their primary job is
 * to pivot us to the kernel %cr3 and %rsp without ruining everything.
 *
 * All of these interrupts use the amd64 IST feature when we have KPTI
 * enabled, meaning that they will execute with their %rsp set to a known
 * location, even if we take them in the kernel.
 *
 * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
 * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the
 * mcpu_kpti (a struct kpti_frame) defined in machcpuvar.h. This struct is
 * set up to be page-aligned, and we map the page it's on into both page
 * tables. Using a struct attached to the cpu_t also means that we can use
 * %rsp-relative addressing to find anything on the cpu_t, so we don't have
 * to touch %gs or GSBASE at all on incoming interrupt trampolines (which
 * can get pretty hairy).
 *
 * This little struct is where the CPU will push the actual interrupt frame.
 * Then, in the trampoline, we change %cr3, then figure out our destination
 * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt
 * frame). Then we jump to the regular ISR in the kernel text and carry on as
 * normal.
 *
 * We leave the original frame and any spilled regs behind in the kpti_frame
 * lazily until we want to return to userland. Then, we clear any spilled
 * regs from it, and overwrite the rest with our iret frame. When switching
 * this CPU to a different process (in hat_switch), we bzero the whole region
 * to make sure nothing can leak between processes.
 *
 * When we're later returning to the original place where we took the
 * interrupt (especially if it was in userland), we have to jmp back to the
 * "return trampolines" here, since when we set %cr3 back to the user value,
 * we need to be executing from code here in these shared pages and not the
 * main kernel text again. Even though it should be fine to iret directly
 * from kernel text when returning to kernel code, we make things jmp to a
 * trampoline here just for consistency.
 *
 * Note that with IST, it's very important that we must always have pivoted
 * away from the IST stack before we can possibly take any other interrupt
 * on the same IST (unless it's an end-of-the-world fault and we don't care
 * about coming back from it ever).
 *
 * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
 * regularly have to happen from within trampoline code (e.g. in the sysenter
 * single-step case) and then return to the world normally.
 * As a result, these two are IST'd to their own kpti_frame right above the
 * normal one (in the same page), so they don't clobber their parent
 * interrupt.
 *
 * To aid with debugging, we also IST the page fault (#PF/pftrap), general
 * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
 * their own separate kpti_frame. This ensures that if we take one of these
 * due to a bug in trampoline code, we preserve the original trampoline
 * state that caused the trap.
 *
 * NMI, MCE and dblfault interrupts are also taken on their own dedicated IST
 * stacks, since they can interrupt another ISR at any time. These stacks are
 * full-sized, however, and not a little kpti_frame struct. We only set %cr3
 * in their trampolines (and do it unconditionally), and don't bother
 * pivoting away. We're either going into the panic() path, or we're going to
 * return straight away without rescheduling, so it's fine not to be on our
 * real kthread stack (and some of the state we want to go find it with might
 * be corrupt!)
 *
 * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
 * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this
 * to point at the PML4 for kas early in boot and never touch it again.
 * Hopefully it survives whatever corruption brings down the rest of the
 * kernel!
 *
 * Syscalls are different from interrupts (at least in the SYSENTER/SYSCALL64
 * cases) in that they do not push an interrupt frame (and also have some
 * other effects). In the syscall trampolines, we assume that we can only be
 * taking the call from userland and use swapgs and an unconditional
 * overwrite of %cr3. We do not do any stack pivoting for syscalls (and we
 * leave SYSENTER's existing %rsp pivot untouched) -- instead we spill
 * registers into %gs:CPU_KPTI_* as we need to.
 *
 * Note that the normal %cr3 values do not cause invalidations with PCIDE --
 * see hat_switch().
 */

/*
 * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
 * fix bugs here, check to see if they should be fixed there as well.
 */
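
/*
 * To make the layout above concrete, here is roughly how a trampoline finds
 * its cpu_t from the IST %rsp (illustrative C only -- struct kpti_frame
 * itself is defined in machcpuvar.h, and CPU_KPTI_START/CPU_THREAD are the
 * assym offsets used by the macros further down in this file):
 *
 *	// The IST %rsp points into the page-aligned mcpu_kpti frame, which
 *	// sits at offset CPU_KPTI_START inside the cpu_t, so:
 *	uintptr_t page = rsp & ~((uintptr_t)MMU_PAGESIZE - 1);
 *	cpu_t *cpu = (cpu_t *)(page - CPU_KPTI_START);
 *	kthread_t *t = cpu->cpu_thread;		// CPU_THREAD(%r13) below
 *
 * This arithmetic is why the kpti_frame must be page-aligned, and why the
 * incoming trampolines never need to touch %gs or GSBASE.
 */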

#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/machbrand.h>
#include <sys/param.h>

#include <sys/segments.h>
#include <sys/pcb.h>
#include <sys/trap.h>
#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/model.h>
#include <sys/panic.h>

#if defined(__xpv)
#include <sys/hypervisor.h>
#endif

#include "assym.h"

	.data
	DGDEF3(kpti_enable, 8, 8)
	.fill	1, 8, 1

#if DEBUG
	.data
_bad_ts_panic_msg:
	.string "kpti_trampolines.s: tr_iret_user but CR0.TS set"
#endif

.section ".text";
.align MMU_PAGESIZE

.global kpti_tramp_start
kpti_tramp_start:
	nop

/* This will be set by mlsetup, and then double-checked later */
.global kpti_safe_cr3
kpti_safe_cr3:
	.quad 0
	SET_SIZE(kpti_safe_cr3)

/* startup_kmem() will overwrite this */
.global kpti_kbase
kpti_kbase:
	.quad KERNELBASE
	SET_SIZE(kpti_kbase)

#define	SET_KERNEL_CR3(spillreg)		\
	mov	%cr3, spillreg;			\
	mov	spillreg, %gs:CPU_KPTI_TR_CR3;	\
	mov	%gs:CPU_KPTI_KCR3, spillreg;	\
	cmp	$0, spillreg;			\
	je	2f;				\
	mov	spillreg, %cr3;			\
2:

#if DEBUG
#define	SET_USER_CR3(spillreg)			\
	mov	%cr3, spillreg;			\
	mov	spillreg, %gs:CPU_KPTI_TR_CR3;	\
	mov	%gs:CPU_KPTI_UCR3, spillreg;	\
	mov	spillreg, %cr3
#else
#define	SET_USER_CR3(spillreg)			\
	mov	%gs:CPU_KPTI_UCR3, spillreg;	\
	mov	spillreg, %cr3
#endif

#define	PIVOT_KPTI_STK(spillreg)		\
	mov	%rsp, spillreg;			\
	mov	%gs:CPU_KPTI_RET_RSP, %rsp;	\
	pushq	T_FRAMERET_SS(spillreg);	\
	pushq	T_FRAMERET_RSP(spillreg);	\
	pushq	T_FRAMERET_RFLAGS(spillreg);	\
	pushq	T_FRAMERET_CS(spillreg);	\
	pushq	T_FRAMERET_RIP(spillreg)


#define	INTERRUPT_TRAMPOLINE_P(errpush)			\
	pushq	%r13;					\
	pushq	%r14;					\
	subq	$KPTI_R14, %rsp;			\
	/* Save current %cr3. */			\
	mov	%cr3, %r14;				\
	mov	%r14, KPTI_TR_CR3(%rsp);		\
							\
	cmpw	$KCS_SEL, KPTI_CS(%rsp);		\
	je	3f;					\
1:							\
	/* Change to the "kernel" %cr3 */		\
	mov	KPTI_KCR3(%rsp), %r14;			\
	cmp	$0, %r14;				\
	je	2f;					\
	mov	%r14, %cr3;				\
2:							\
	/* Get our cpu_t in %r13 */			\
	mov	%rsp, %r13;				\
	and	$(~(MMU_PAGESIZE - 1)), %r13;		\
	subq	$CPU_KPTI_START, %r13;			\
	/* Use top of the kthread stk */		\
	mov	CPU_THREAD(%r13), %r14;			\
	mov	T_STACK(%r14), %r14;			\
	addq	$REGSIZE+MINFRAME, %r14;		\
	jmp	4f;					\
3:							\
	/* Check the %rsp in the frame. */		\
	/* Is it above kernel base? */			\
	mov	kpti_kbase, %r14;			\
	cmp	%r14, KPTI_RSP(%rsp);			\
	jb	1b;					\
	/* Use the %rsp from the trap frame */		\
	mov	KPTI_RSP(%rsp), %r14;			\
	and	$(~0xf), %r14;				\
4:							\
	mov	%rsp, %r13;				\
	/* %r14 contains our destination stk */		\
	mov	%r14, %rsp;				\
	pushq	KPTI_SS(%r13);				\
	pushq	KPTI_RSP(%r13);				\
	pushq	KPTI_RFLAGS(%r13);			\
	pushq	KPTI_CS(%r13);				\
	pushq	KPTI_RIP(%r13);				\
	errpush;					\
	mov	KPTI_R14(%r13), %r14;			\
	mov	KPTI_R13(%r13), %r13

#define	INTERRUPT_TRAMPOLINE_NOERR	\
	INTERRUPT_TRAMPOLINE_P(/**/)

#define	INTERRUPT_TRAMPOLINE		\
	INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
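
/*
 * In rough C terms, INTERRUPT_TRAMPOLINE_P above picks its destination stack
 * like this (illustrative pseudocode only -- the macro is authoritative, and
 * the KPTI_*/CPU_*/T_* names are the assym offsets it uses):
 *
 *	// %rsp currently points into this cpu's kpti_frame, where %r13/%r14
 *	// and the hardware-pushed frame have been saved.
 *	if (KPTI_CS != KCS_SEL || KPTI_RSP < kpti_kbase) {
 *		// Interrupted userland (or a bogus kernel %rsp): load the
 *		// kernel %cr3 (if one is set) and land on the top of the
 *		// current kthread's stack.
 *		dest = CPU_THREAD->T_STACK + REGSIZE + MINFRAME;
 *	} else {
 *		// Interrupted the kernel proper: keep using the stack we
 *		// interrupted, 16-byte aligned, and leave %cr3 alone.
 *		dest = KPTI_RSP & ~0xf;
 *	}
 *	// Re-push SS/RSP/RFLAGS/CS/RIP (and the error code, if any) at dest,
 *	// restore %r13/%r14, and fall through to the real ISR.
 */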

/*
 * This is used for all interrupts that can plausibly be taken inside another
 * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
 *
 * We also use this for #NP, even though it uses the standard IST: the
 * additional %rsp checks below will catch the case where we get an exception
 * doing an iret to userspace with a bad %cs/%ss. This appears as a kernel
 * trap, and only later gets redirected via kern_gpfault().
 *
 * We check for whether we took the interrupt while in another trampoline, in
 * which case we need to use the kthread stack.
 */
#define	DBG_INTERRUPT_TRAMPOLINE_P(errpush)		\
	pushq	%r13;					\
	pushq	%r14;					\
	subq	$KPTI_R14, %rsp;			\
	/* Check for clobbering */			\
	cmp	$0, KPTI_FLAG(%rsp);			\
	je	1f;					\
	/* Don't worry, this totally works */		\
	int	$8;					\
1:							\
	movq	$1, KPTI_FLAG(%rsp);			\
	/* Save current %cr3. */			\
	mov	%cr3, %r14;				\
	mov	%r14, KPTI_TR_CR3(%rsp);		\
							\
	cmpw	$KCS_SEL, KPTI_CS(%rsp);		\
	je	4f;					\
2:							\
	/* Change to the "kernel" %cr3 */		\
	mov	KPTI_KCR3(%rsp), %r14;			\
	cmp	$0, %r14;				\
	je	3f;					\
	mov	%r14, %cr3;				\
3:							\
	/* Get our cpu_t in %r13 */			\
	mov	%rsp, %r13;				\
	and	$(~(MMU_PAGESIZE - 1)), %r13;		\
	subq	$CPU_KPTI_START, %r13;			\
	/* Use top of the kthread stk */		\
	mov	CPU_THREAD(%r13), %r14;			\
	mov	T_STACK(%r14), %r14;			\
	addq	$REGSIZE+MINFRAME, %r14;		\
	jmp	6f;					\
4:							\
	/* Check the %rsp in the frame. */		\
	/* Is it above kernel base? */			\
	/* If not, treat as user. */			\
	mov	kpti_kbase, %r14;			\
	cmp	%r14, KPTI_RSP(%rsp);			\
	jb	2b;					\
	/* Is it within the kpti_frame page? */		\
	/* If it is, treat as user interrupt */		\
	mov	%rsp, %r13;				\
	and	$(~(MMU_PAGESIZE - 1)), %r13;		\
	mov	KPTI_RSP(%rsp), %r14;			\
	and	$(~(MMU_PAGESIZE - 1)), %r14;		\
	cmp	%r13, %r14;				\
	je	2b;					\
	/* Were we in trampoline code? */		\
	leaq	kpti_tramp_start, %r14;			\
	cmp	%r14, KPTI_RIP(%rsp);			\
	jb	5f;					\
	leaq	kpti_tramp_end, %r14;			\
	cmp	%r14, KPTI_RIP(%rsp);			\
	ja	5f;					\
	/* If we were, change %cr3: we might */		\
	/* have interrupted before it did. */		\
	mov	KPTI_KCR3(%rsp), %r14;			\
	mov	%r14, %cr3;				\
5:							\
	/* Use the %rsp from the trap frame */		\
	mov	KPTI_RSP(%rsp), %r14;			\
	and	$(~0xf), %r14;				\
6:							\
	mov	%rsp, %r13;				\
	/* %r14 contains our destination stk */		\
	mov	%r14, %rsp;				\
	pushq	KPTI_SS(%r13);				\
	pushq	KPTI_RSP(%r13);				\
	pushq	KPTI_RFLAGS(%r13);			\
	pushq	KPTI_CS(%r13);				\
	pushq	KPTI_RIP(%r13);				\
	errpush;					\
	mov	KPTI_R14(%r13), %r14;			\
	movq	$0, KPTI_FLAG(%r13);			\
	mov	KPTI_R13(%r13), %r13

#define	DBG_INTERRUPT_TRAMPOLINE_NOERR	\
	DBG_INTERRUPT_TRAMPOLINE_P(/**/)

#define	DBG_INTERRUPT_TRAMPOLINE	\
	DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
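
/*
 * The debug variant above differs from INTERRUPT_TRAMPOLINE_P in three ways;
 * as a loose sketch (illustrative only, using the assym names from the macro):
 *
 *	if (KPTI_FLAG != 0)	// this kpti_frame is already in use --
 *		int $8;		// we clobbered ourselves; force a #DF panic
 *	KPTI_FLAG = 1;		// mark the frame busy (cleared on the way out)
 *
 *	// Even with a kernel %cs, treat the entry as "user" if the
 *	// interrupted %rsp is below kpti_kbase or points into this cpu's own
 *	// kpti_frame page (i.e. we interrupted another trampoline's stack).
 *
 *	// If the interrupted %rip lies within [kpti_tramp_start,
 *	// kpti_tramp_end], reload the kernel %cr3 anyway: the interrupted
 *	// trampoline may not have gotten around to switching it yet.
 */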

/*
 * These labels (_start and _end) are used by trap.c to determine if
 * we took an interrupt like an NMI during the return process.
 */
.global tr_sysc_ret_start
tr_sysc_ret_start:

/*
 * Syscall return trampolines.
 *
 * These are expected to be called on the kernel %gs. tr_sysret[ql] are
 * called after %rsp is changed back to the user value, so we have no
 * stack to work with. tr_sysexit has a kernel stack (but has to
 * preserve rflags, soooo).
 */
ENTRY_NP(tr_sysretq)
	cmpq	$1, kpti_enable
	jne	1f

	mov	%r13, %gs:CPU_KPTI_R13
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
1:
	swapgs
	sysretq
	SET_SIZE(tr_sysretq)

ENTRY_NP(tr_sysretl)
	cmpq	$1, kpti_enable
	jne	1f

	mov	%r13, %gs:CPU_KPTI_R13
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
1:
	SWAPGS
	SYSRETL
	SET_SIZE(tr_sysretl)

ENTRY_NP(tr_sysexit)
	/*
	 * Note: we want to preserve RFLAGS across this branch, since sysexit
	 * (unlike sysret above) does not restore RFLAGS for us.
	 *
	 * We still have the real kernel stack (sysexit does restore that), so
	 * we can use pushfq/popfq.
	 */
	pushfq

	cmpq	$1, kpti_enable
	jne	1f

	/* Have to pop it back off now before we change %cr3! */
	popfq
	mov	%r13, %gs:CPU_KPTI_R13
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
	jmp	2f
1:
	popfq
2:
	swapgs
	sti
	sysexit
	SET_SIZE(tr_sysexit)

.global tr_sysc_ret_end
tr_sysc_ret_end:

/*
 * Syscall entry trampolines.
 */

#if DEBUG
#define	MK_SYSCALL_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_/**/isr);			\
	swapgs;					\
	mov	%r13, %gs:CPU_KPTI_R13;		\
	mov	%cr3, %r13;			\
	mov	%r13, %gs:CPU_KPTI_TR_CR3;	\
	mov	%gs:CPU_KPTI_KCR3, %r13;	\
	mov	%r13, %cr3;			\
	mov	%gs:CPU_KPTI_R13, %r13;		\
	swapgs;					\
	jmp	isr;				\
	SET_SIZE(tr_/**/isr)
#else
#define	MK_SYSCALL_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_/**/isr);			\
	swapgs;					\
	mov	%r13, %gs:CPU_KPTI_R13;		\
	mov	%gs:CPU_KPTI_KCR3, %r13;	\
	mov	%r13, %cr3;			\
	mov	%gs:CPU_KPTI_R13, %r13;		\
	swapgs;					\
	jmp	isr;				\
	SET_SIZE(tr_/**/isr)
#endif

MK_SYSCALL_TRAMPOLINE(sys_syscall)
MK_SYSCALL_TRAMPOLINE(sys_syscall32)
MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)
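
/*
 * For contrast with the interrupt trampolines above: since SYSCALL pushes no
 * frame and doesn't touch %rsp, the (non-DEBUG) expansion of
 * MK_SYSCALL_TRAMPOLINE boils down to just this (illustrative pseudocode):
 *
 *	swapgs;				// kernel gsbase, to reach our cpu_t
 *	%gs:CPU_KPTI_R13 = %r13;	// spill one scratch register
 *	%cr3 = %gs:CPU_KPTI_KCR3;	// unconditionally go to the kernel
 *					// page table (we can only have come
 *					// from userland)
 *	%r13 = %gs:CPU_KPTI_R13;	// restore the scratch register
 *	swapgs;				// back to the user gsbase -- the real
 *					// handler does its own swapgs
 *	goto isr;			// e.g. sys_syscall
 *
 * No stack pivot happens here; the real handler sets up its own stack.
 */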

/*
 * SYSENTER is special. The CPU is really not very helpful when it
 * comes to preserving and restoring state with it, and as a result
 * we have to do all of it by hand. So, since we want to preserve
 * RFLAGS, we have to be very careful in these trampolines to not
 * clobber any bits in it. That means no cmpqs or branches!
 */
ENTRY_NP(tr_sys_sysenter)
	swapgs
	mov	%r13, %gs:CPU_KPTI_R13
#if DEBUG
	mov	%cr3, %r13
	mov	%r13, %gs:CPU_KPTI_TR_CR3
#endif
	mov	%gs:CPU_KPTI_KCR3, %r13
	mov	%r13, %cr3
	mov	%gs:CPU_KPTI_R13, %r13
	jmp	_sys_sysenter_post_swapgs
	SET_SIZE(tr_sys_sysenter)

ENTRY_NP(tr_brand_sys_sysenter)
	swapgs
	mov	%r13, %gs:CPU_KPTI_R13
#if DEBUG
	mov	%cr3, %r13
	mov	%r13, %gs:CPU_KPTI_TR_CR3
#endif
	mov	%gs:CPU_KPTI_KCR3, %r13
	mov	%r13, %cr3
	mov	%gs:CPU_KPTI_R13, %r13
	jmp	_brand_sys_sysenter_post_swapgs
	SET_SIZE(tr_brand_sys_sysenter)

#define	MK_SYSCALL_INT_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_/**/isr);			\
	swapgs;					\
	mov	%r13, %gs:CPU_KPTI_R13;		\
	SET_KERNEL_CR3(%r13);			\
	mov	%gs:CPU_THREAD, %r13;		\
	mov	T_STACK(%r13), %r13;		\
	addq	$REGSIZE+MINFRAME, %r13;	\
	mov	%r13, %rsp;			\
	pushq	%gs:CPU_KPTI_SS;		\
	pushq	%gs:CPU_KPTI_RSP;		\
	pushq	%gs:CPU_KPTI_RFLAGS;		\
	pushq	%gs:CPU_KPTI_CS;		\
	pushq	%gs:CPU_KPTI_RIP;		\
	mov	%gs:CPU_KPTI_R13, %r13;		\
	swapgs;					\
	jmp	isr;				\
	SET_SIZE(tr_/**/isr)

MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)

/*
 * Interrupt/trap return trampolines
 */

.global tr_intr_ret_start
tr_intr_ret_start:

ENTRY_NP(tr_iret_auto)
	cmpq	$1, kpti_enable
	jne	tr_iret_kernel
	cmpw	$KCS_SEL, T_FRAMERET_CS(%rsp)
	je	tr_iret_kernel
	jmp	tr_iret_user
	SET_SIZE(tr_iret_auto)

ENTRY_NP(tr_iret_kernel)
	/*
	 * Yes, this does nothing extra. But this way we know if we see iret
	 * elsewhere, then we've failed to properly consider trampolines there.
	 */
	iretq
	SET_SIZE(tr_iret_kernel)

ENTRY_NP(tr_iret_user)
#if DEBUG
	/*
	 * Panic if we find CR0.TS set. We're still on the kernel stack and
	 * %cr3, but we do need to swap back to the kernel gs. (We don't worry
	 * about swapgs speculation here.)
	 */
	pushq	%rax
	mov	%cr0, %rax
	testq	$CR0_TS, %rax
	jz	1f
	swapgs
	popq	%rax
	leaq	_bad_ts_panic_msg(%rip), %rdi
	xorl	%eax, %eax
	pushq	%rbp
	movq	%rsp, %rbp
	call	panic
1:
	popq	%rax
#endif

	cmpq	$1, kpti_enable
	jne	1f

	/*
	 * KPTI enabled: we're on the user gsbase at this point, so we
	 * need to swap back so we can pivot stacks.
	 *
	 * The swapgs lfence mitigation is probably not needed here
	 * since a mis-speculation of the above branch would imply KPTI
	 * is disabled, but we'll do so anyway.
	 */
	swapgs
	lfence
	mov	%r13, %gs:CPU_KPTI_R13
	PIVOT_KPTI_STK(%r13)
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap. */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
	/* And back to user gsbase again. */
	swapgs
1:
	iretq
	SET_SIZE(tr_iret_user)
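
/*
 * Pulling PIVOT_KPTI_STK and SET_USER_CR3 together, the KPTI branch of
 * tr_iret_user above does roughly the following (illustrative pseudocode):
 *
 *	swapgs; lfence;			// back onto the kernel gsbase
 *	// PIVOT_KPTI_STK: copy the iret frame off the kthread stack onto the
 *	// per-cpu return area at %gs:CPU_KPTI_RET_RSP, and point %rsp at it.
 *	// SET_USER_CR3: switch to the user page table; from here on only
 *	// these trampoline pages (and the kpti_frame page) are still mapped,
 *	// which is why we must iret from here and not from kernel text.
 *	// Zero the CPU_KPTI_R13/R14 spill slots so nothing leaks.
 *	swapgs;				// restore the user gsbase
 *	iretq;				// consumes the frame on the kpti_frame page
 */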

/*
 * This special return trampoline is for KDI's use only (with kmdb).
 *
 * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
 * instead. This trampoline runs after GSBASE has already been changed
 * back to the userland value (so we can't use %gs).
 *
 * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
 * The KPTI_R13 member in the kpti_dbg has already been set to what the
 * real %r13 should be before we IRET.
 *
 * Additionally, KDI keeps a copy of the incoming %cr3 value when it
 * took an interrupt, and has put that back in the kpti_dbg area for us
 * to use, so we don't do any sniffing of %cs here. This is important
 * so that debugging code that changes %cr3 is possible.
 */
ENTRY_NP(tr_iret_kdi)
	movq	%r14, KPTI_R14(%r13)	/* %r14 has to be preserved by us */

	movq	%rsp, %r14	/* original %rsp is pointing at IRET frame */
	leaq	KPTI_TOP(%r13), %rsp
	pushq	T_FRAMERET_SS(%r14)
	pushq	T_FRAMERET_RSP(%r14)
	pushq	T_FRAMERET_RFLAGS(%r14)
	pushq	T_FRAMERET_CS(%r14)
	pushq	T_FRAMERET_RIP(%r14)

	movq	KPTI_TR_CR3(%r13), %r14
	movq	%r14, %cr3

	movq	KPTI_R14(%r13), %r14
	movq	KPTI_R13(%r13), %r13	/* preserved by our caller */

	iretq
	SET_SIZE(tr_iret_kdi)

.global	tr_intr_ret_end
tr_intr_ret_end:

/*
 * Interrupt/trap entry trampolines
 */

/* CPU pushed an error code, and ISR wants one */
#define	MK_INTR_TRAMPOLINE(isr)		\
	ENTRY_NP(tr_/**/isr);		\
	INTERRUPT_TRAMPOLINE;		\
	jmp	isr;			\
	SET_SIZE(tr_/**/isr)

/* CPU didn't push an error code, and ISR doesn't want one */
#define	MK_INTR_TRAMPOLINE_NOERR(isr)	\
	ENTRY_NP(tr_/**/isr);		\
	push	$0;			\
	INTERRUPT_TRAMPOLINE_NOERR;	\
	jmp	isr;			\
	SET_SIZE(tr_/**/isr)

/* CPU pushed an error code, and ISR wants one */
#define	MK_DBG_INTR_TRAMPOLINE(isr)	\
	ENTRY_NP(tr_/**/isr);		\
	DBG_INTERRUPT_TRAMPOLINE;	\
	jmp	isr;			\
	SET_SIZE(tr_/**/isr)

/* CPU didn't push an error code, and ISR doesn't want one */
#define	MK_DBG_INTR_TRAMPOLINE_NOERR(isr)	\
	ENTRY_NP(tr_/**/isr);			\
	push	$0;				\
	DBG_INTERRUPT_TRAMPOLINE_NOERR;		\
	jmp	isr;				\
	SET_SIZE(tr_/**/isr)


MK_INTR_TRAMPOLINE_NOERR(div0trap)
MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
MK_INTR_TRAMPOLINE_NOERR(boundstrap)
MK_INTR_TRAMPOLINE_NOERR(invoptrap)
MK_INTR_TRAMPOLINE_NOERR(ndptrap)
MK_INTR_TRAMPOLINE(invtsstrap)
MK_DBG_INTR_TRAMPOLINE(segnptrap)
MK_DBG_INTR_TRAMPOLINE(stktrap)
MK_DBG_INTR_TRAMPOLINE(gptrap)
MK_DBG_INTR_TRAMPOLINE(pftrap)
MK_INTR_TRAMPOLINE_NOERR(resvtrap)
MK_INTR_TRAMPOLINE_NOERR(ndperr)
MK_INTR_TRAMPOLINE(achktrap)
MK_INTR_TRAMPOLINE_NOERR(xmtrap)
MK_INTR_TRAMPOLINE_NOERR(invaltrap)
MK_INTR_TRAMPOLINE_NOERR(fasttrap)
MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)

/*
 * These are special because they can interrupt other traps, and
 * each other. We don't need to pivot their stacks, because they have
 * dedicated IST stack space, but we need to change %cr3.
 */
ENTRY_NP(tr_nmiint)
	pushq	%r13
	mov	kpti_safe_cr3, %r13
	mov	%r13, %cr3
	popq	%r13
	jmp	nmiint
	SET_SIZE(tr_nmiint)

#if !defined(__xpv)
ENTRY_NP(tr_syserrtrap)
	/*
	 * If we got here, we should always have a zero error code pushed.
	 * The INT $0x8 instruction doesn't seem to push one, though, which
	 * we use as an emergency panic in the other trampolines. So adjust
	 * things here.
	 */
	cmpq	$0, (%rsp)
	je	1f
	pushq	$0
1:
	pushq	%r13
	mov	kpti_safe_cr3, %r13
	mov	%r13, %cr3
	popq	%r13
	jmp	syserrtrap
	SET_SIZE(tr_syserrtrap)
#endif

ENTRY_NP(tr_mcetrap)
	pushq	%r13
	mov	kpti_safe_cr3, %r13
	mov	%r13, %cr3
	popq	%r13
	jmp	mcetrap
	SET_SIZE(tr_mcetrap)

/*
 * Interrupts start at 32
 */
#define	MKIVCT(n)			\
	ENTRY_NP(tr_ivct/**/n)		\
	push	$0;			\
	INTERRUPT_TRAMPOLINE;		\
	push	$n - 0x20;		\
	jmp	cmnint;			\
	SET_SIZE(tr_ivct/**/n)

MKIVCT(32);	MKIVCT(33);	MKIVCT(34);	MKIVCT(35);
MKIVCT(36);	MKIVCT(37);	MKIVCT(38);	MKIVCT(39);
MKIVCT(40);	MKIVCT(41);	MKIVCT(42);	MKIVCT(43);
MKIVCT(44);	MKIVCT(45);	MKIVCT(46);	MKIVCT(47);
MKIVCT(48);	MKIVCT(49);	MKIVCT(50);	MKIVCT(51);
MKIVCT(52);	MKIVCT(53);	MKIVCT(54);	MKIVCT(55);
MKIVCT(56);	MKIVCT(57);	MKIVCT(58);	MKIVCT(59);
MKIVCT(60);	MKIVCT(61);	MKIVCT(62);	MKIVCT(63);
MKIVCT(64);	MKIVCT(65);	MKIVCT(66);	MKIVCT(67);
MKIVCT(68);	MKIVCT(69);	MKIVCT(70);	MKIVCT(71);
MKIVCT(72);	MKIVCT(73);	MKIVCT(74);	MKIVCT(75);
MKIVCT(76);	MKIVCT(77);	MKIVCT(78);	MKIVCT(79);
MKIVCT(80);	MKIVCT(81);	MKIVCT(82);	MKIVCT(83);
MKIVCT(84);	MKIVCT(85);	MKIVCT(86);	MKIVCT(87);
MKIVCT(88);	MKIVCT(89);	MKIVCT(90);	MKIVCT(91);
MKIVCT(92);	MKIVCT(93);	MKIVCT(94);	MKIVCT(95);
MKIVCT(96);	MKIVCT(97);	MKIVCT(98);	MKIVCT(99);
MKIVCT(100);	MKIVCT(101);	MKIVCT(102);	MKIVCT(103);
MKIVCT(104);	MKIVCT(105);	MKIVCT(106);	MKIVCT(107);
MKIVCT(108);	MKIVCT(109);	MKIVCT(110);	MKIVCT(111);
MKIVCT(112);	MKIVCT(113);	MKIVCT(114);	MKIVCT(115);
MKIVCT(116);	MKIVCT(117);	MKIVCT(118);	MKIVCT(119);
MKIVCT(120);	MKIVCT(121);	MKIVCT(122);	MKIVCT(123);
MKIVCT(124);	MKIVCT(125);	MKIVCT(126);	MKIVCT(127);
MKIVCT(128);	MKIVCT(129);	MKIVCT(130);	MKIVCT(131);
MKIVCT(132);	MKIVCT(133);	MKIVCT(134);	MKIVCT(135);
MKIVCT(136);	MKIVCT(137);	MKIVCT(138);	MKIVCT(139);
MKIVCT(140);	MKIVCT(141);	MKIVCT(142);	MKIVCT(143);
MKIVCT(144);	MKIVCT(145);	MKIVCT(146);	MKIVCT(147);
MKIVCT(148);	MKIVCT(149);	MKIVCT(150);	MKIVCT(151);
MKIVCT(152);	MKIVCT(153);	MKIVCT(154);	MKIVCT(155);
MKIVCT(156);	MKIVCT(157);	MKIVCT(158);	MKIVCT(159);
MKIVCT(160);	MKIVCT(161);	MKIVCT(162);	MKIVCT(163);
MKIVCT(164);	MKIVCT(165);	MKIVCT(166);	MKIVCT(167);
MKIVCT(168);	MKIVCT(169);	MKIVCT(170);	MKIVCT(171);
MKIVCT(172);	MKIVCT(173);	MKIVCT(174);	MKIVCT(175);
MKIVCT(176);	MKIVCT(177);	MKIVCT(178);	MKIVCT(179);
MKIVCT(180);	MKIVCT(181);	MKIVCT(182);	MKIVCT(183);
MKIVCT(184);	MKIVCT(185);	MKIVCT(186);	MKIVCT(187);
MKIVCT(188);	MKIVCT(189);	MKIVCT(190);	MKIVCT(191);
MKIVCT(192);	MKIVCT(193);	MKIVCT(194);	MKIVCT(195);
MKIVCT(196);	MKIVCT(197);	MKIVCT(198);	MKIVCT(199);
MKIVCT(200);	MKIVCT(201);	MKIVCT(202);	MKIVCT(203);
MKIVCT(204);	MKIVCT(205);	MKIVCT(206);	MKIVCT(207);
MKIVCT(208);	MKIVCT(209);	MKIVCT(210);	MKIVCT(211);
MKIVCT(212);	MKIVCT(213);	MKIVCT(214);	MKIVCT(215);
MKIVCT(216);	MKIVCT(217);	MKIVCT(218);	MKIVCT(219);
MKIVCT(220);	MKIVCT(221);	MKIVCT(222);	MKIVCT(223);
MKIVCT(224);	MKIVCT(225);	MKIVCT(226);	MKIVCT(227);
MKIVCT(228);	MKIVCT(229);	MKIVCT(230);	MKIVCT(231);
MKIVCT(232);	MKIVCT(233);	MKIVCT(234);	MKIVCT(235);
MKIVCT(236);	MKIVCT(237);	MKIVCT(238);	MKIVCT(239);
MKIVCT(240);	MKIVCT(241);	MKIVCT(242);	MKIVCT(243);
MKIVCT(244);	MKIVCT(245);	MKIVCT(246);	MKIVCT(247);
MKIVCT(248);	MKIVCT(249);	MKIVCT(250);
MKIVCT(251);	MKIVCT(252);	MKIVCT(253);	MKIVCT(254);	MKIVCT(255);

/*
 * We're PCIDE, but we don't have INVPCID. The only way to invalidate a
 * PCID other than the current one, then, is to load its cr3 then
 * invlpg. But loading kf_user_cr3 means we can no longer access our
 * caller's text mapping (or indeed, its stack). So this little helper
 * has to live within our trampoline text region.
 *
 * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
 */
ENTRY_NP(tr_mmu_flush_user_range)
	push	%rbx
	/* When we read cr3, it never has the NOINVL bit set. */
	mov	%cr3, %rax
	movq	$CR3_NOINVL_BIT, %rbx
	orq	%rbx, %rax

	mov	%rcx, %cr3
	add	%rdi, %rsi
	.align	ASM_ENTRY_ALIGN
1:
	invlpg	(%rdi)
	add	%rdx, %rdi
	cmp	%rsi, %rdi
	jb	1b
	mov	%rax, %cr3
	pop	%rbx
	retq
	SET_SIZE(tr_mmu_flush_user_range)

.align MMU_PAGESIZE
.global kpti_tramp_end
kpti_tramp_end:
	nop
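
/*
 * For reference, tr_mmu_flush_user_range above is roughly equivalent to the
 * following C (illustrative only; getcr3/setcr3/invlpg stand in for the
 * inline register accesses, and the helper must really live in these
 * trampoline pages, as noted above):
 *
 *	void
 *	tr_mmu_flush_user_range(uint64_t addr, uint64_t len, uint64_t pgsz,
 *	    uint64_t cr3)
 *	{
 *		// Remember our own %cr3, with NOINVL set so that switching
 *		// back doesn't flush our current PCID's TLB entries.
 *		uint64_t ret_cr3 = getcr3() | CR3_NOINVL_BIT;
 *
 *		setcr3(cr3);			// activate the target PCID
 *		for (uint64_t va = addr; va < addr + len; va += pgsz)
 *			invlpg(va);		// invalidate within that PCID
 *		setcr3(ret_cr3);		// return without self-flushing
 *	}
 */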