11859 need swapgs mitigation
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
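
old/usr/src/uts/i86pc/ml/kpti_trampolines.s: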
   1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 /*
  12  * Copyright 2018 Joyent, Inc.
  13  */
  14 
  15 /*
  16  * This file contains the trampolines that are used by KPTI in order to be
  17  * able to take interrupts/trap/etc while on the "user" page table.
  18  *
  19  * We don't map the full kernel text into the user page table: instead we
  20  * map this one small section of trampolines (which compiles to ~13 pages).
  21  * These trampolines are set in the IDT always (so they will run no matter
  22  * whether we're on the kernel or user page table), and their primary job is to
  23  * pivot us to the kernel %cr3 and %rsp without ruining everything.
  24  *
  25  * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
  26  * meaning that they will execute with their %rsp set to a known location, even
  27  * if we take them in the kernel.
  28  *
  29  * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
  30  * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
  31  * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
  32  * page-aligned, and we map the page it's on into both page tables. Using a


  71  * due to a bug in trampoline code, we preserve the original trampoline
  72  * state that caused the trap.
  73  *
  74  * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
  75  * stacks, since they can interrupt another ISR at any time. These stacks are
  76  * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
  77  * their trampolines (and do it unconditionally), and don't bother pivoting
  78  * away. We're either going into the panic() path, or we're going to return
  79  * straight away without rescheduling, so it's fine to not be on our real
  80  * kthread stack (and some of the state we want to go find it with might be
  81  * corrupt!)
  82  *
  83  * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
  84  * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
  85  * point at the PML4 for kas early in boot and never touch it again. Hopefully
  86  * it survives whatever corruption brings down the rest of the kernel!
  87  *
  88  * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
  89  * cases) in that they do not push an interrupt frame (and also have some other
  90  * effects). In the syscall trampolines, we assume that we can only be taking
  91  * the call from userland and use SWAPGS and an unconditional overwrite of %cr3.
  92  * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
  93  * existing %rsp pivot untouched) -- instead we spill registers into
  94  * %gs:CPU_KPTI_* as we need to.
  95  *
  96  * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
  97  * hat_switch().
  98  */
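
[Editor's note: to make the "special" NMI/MCE/double-fault path described above concrete, here is a minimal, hypothetical sketch of that style of trampoline. It is illustrative only -- the nmi_handler label and the exact addressing are stand-ins, not the real illumos code. The only job is to get onto a known-good page table, unconditionally, before touching anything else:

        /*
         * Illustrative sketch only: unconditional pivot onto the
         * known-safe %cr3 stashed in trampoline text (kpti_safe_cr3),
         * with no stack pivot -- we stay on the dedicated IST stack.
         */
        pushq   %r13                            /* scratch register */
        mov     kpti_safe_cr3(%rip), %r13       /* PML4 for kas, set at boot */
        mov     %r13, %cr3                      /* kernel text now mapped */
        popq    %r13
        jmp     nmi_handler                     /* hypothetical real handler */
]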
  99 
 100 /*
 101  * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
 102  * fix bugs here check to see if they should be fixed there as well.
 103  */
 104 
 105 #include <sys/asm_linkage.h>
 106 #include <sys/asm_misc.h>
 107 #include <sys/regset.h>
 108 #include <sys/privregs.h>
 109 #include <sys/psw.h>
 110 #include <sys/machbrand.h>
 111 #include <sys/param.h>


 486         mov     %r13, %cr3
 487         mov     %gs:CPU_KPTI_R13, %r13
 488         jmp     _brand_sys_sysenter_post_swapgs
 489         SET_SIZE(tr_brand_sys_sysenter)
 490 
 491 #define MK_SYSCALL_INT_TRAMPOLINE(isr)          \
 492         ENTRY_NP(tr_/**/isr);                   \
 493         swapgs;                                 \
 494         mov     %r13, %gs:CPU_KPTI_R13;         \
 495         SET_KERNEL_CR3(%r13);                   \
 496         mov     %gs:CPU_THREAD, %r13;           \
 497         mov     T_STACK(%r13), %r13;            \
 498         addq    $REGSIZE+MINFRAME, %r13;        \
 499         mov     %r13, %rsp;                     \
 500         pushq   %gs:CPU_KPTI_SS;                \
 501         pushq   %gs:CPU_KPTI_RSP;               \
 502         pushq   %gs:CPU_KPTI_RFLAGS;            \
 503         pushq   %gs:CPU_KPTI_CS;                \
 504         pushq   %gs:CPU_KPTI_RIP;               \
 505         mov     %gs:CPU_KPTI_R13, %r13;         \
 506         SWAPGS;                                 \
 507         jmp     isr;                            \
 508         SET_SIZE(tr_/**/isr)
 509 
 510         MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
 511         MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
 512 
 513         /*
 514          * Interrupt/trap return trampolines
 515          */
 516 
 517 .global tr_intr_ret_start
 518 tr_intr_ret_start:
 519 
 520         ENTRY_NP(tr_iret_auto)
 521         cmpq    $1, kpti_enable
 522         jne     tr_iret_kernel
 523         cmpw    $KCS_SEL, T_FRAMERET_CS(%rsp)
 524         je      tr_iret_kernel
 525         jmp     tr_iret_user
 526         SET_SIZE(tr_iret_auto)
 527 
 528         ENTRY_NP(tr_iret_kernel)
 529         /*
 530          * Yes, this does nothing extra. But this way we know if we see iret
 531          * elsewhere, then we've failed to properly consider trampolines there.
 532          */
 533         iretq
 534         SET_SIZE(tr_iret_kernel)
 535 
 536         ENTRY_NP(tr_iret_user)
 537 #if DEBUG
 538         /*
 539          * Ensure that we return to user land with CR0.TS clear. We do this
 540          * before we trampoline back and pivot the stack and %cr3. This way
 541          * we're still on the kernel stack and kernel %cr3, though we are on the
 542          * user GSBASE.
 543          */
 544         pushq   %rax
 545         mov     %cr0, %rax
 546         testq   $CR0_TS, %rax
 547         jz      1f
 548         swapgs
 549         popq    %rax
 550         leaq    _bad_ts_panic_msg(%rip), %rdi
 551         xorl    %eax, %eax
 552         pushq   %rbp
 553         movq    %rsp, %rbp
 554         call    panic
 555 1:
 556         popq    %rax
 557 #endif
 558 
 559         cmpq    $1, kpti_enable
 560         jne     1f
 561 
 562         swapgs
 563         mov     %r13, %gs:CPU_KPTI_R13
 564         PIVOT_KPTI_STK(%r13)
 565         SET_USER_CR3(%r13)
 566         mov     %gs:CPU_KPTI_R13, %r13
 567         /* Zero these to make sure they didn't leak from a kernel trap */
 568         movq    $0, %gs:CPU_KPTI_R13
 569         movq    $0, %gs:CPU_KPTI_R14
 570         swapgs
 571 1:
 572         iretq
 573         SET_SIZE(tr_iret_user)
 574 
 575         /*
 576          * This special return trampoline is for KDI's use only (with kmdb).
 577          *
 578          * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
 579          * instead. This trampoline runs after GSBASE has already been changed
 580          * back to the userland value (so we can't use %gs).
 581          *
 582          * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
 583          * The KPTI_R13 member in the kpti_dbg has already been set to what the
 584          * real %r13 should be before we IRET.
 585          *
 586          * Additionally, KDI keeps a copy of the incoming %cr3 value when it
 587          * took an interrupt, and has put that back in the kpti_dbg area for us
 588          * to use, so we don't do any sniffing of %cs here. This is important
 589          * so that debugging code that changes %cr3 is possible.


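new/usr/src/uts/i86pc/ml/kpti_trampolines.s: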
   1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 /*
  12  * Copyright 2019 Joyent, Inc.
  13  */
  14 
  15 /*
  16  * This file contains the trampolines that are used by KPTI in order to be
  17  * able to take interrupts/trap/etc while on the "user" page table.
  18  *
  19  * We don't map the full kernel text into the user page table: instead we
  20  * map this one small section of trampolines (which compiles to ~13 pages).
  21  * These trampolines are set in the IDT always (so they will run no matter
  22  * whether we're on the kernel or user page table), and their primary job is to
  23  * pivot us to the kernel %cr3 and %rsp without ruining everything.
  24  *
  25  * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
  26  * meaning that they will execute with their %rsp set to a known location, even
  27  * if we take them in the kernel.
  28  *
  29  * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
  30  * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
  31  * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
  32  * page-aligned, and we map the page it's on into both page tables. Using a


  71  * due to a bug in trampoline code, we preserve the original trampoline
  72  * state that caused the trap.
  73  *
  74  * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
  75  * stacks, since they can interrupt another ISR at any time. These stacks are
  76  * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
  77  * their trampolines (and do it unconditionally), and don't bother pivoting
  78  * away. We're either going into the panic() path, or we're going to return
  79  * straight away without rescheduling, so it's fine to not be on our real
  80  * kthread stack (and some of the state we want to go find it with might be
  81  * corrupt!)
  82  *
  83  * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
  84  * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
  85  * point at the PML4 for kas early in boot and never touch it again. Hopefully
  86  * it survives whatever corruption brings down the rest of the kernel!
  87  *
  88  * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
  89  * cases) in that they do not push an interrupt frame (and also have some other
  90  * effects). In the syscall trampolines, we assume that we can only be taking
  91  * the call from userland and use swapgs and an unconditional overwrite of %cr3.
  92  * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
  93  * existing %rsp pivot untouched) -- instead we spill registers into
  94  * %gs:CPU_KPTI_* as we need to.
  95  *
  96  * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
  97  * hat_switch().
  98  */
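
[Editor's note: on the PCIDE remark just above -- with CR4.PCIDE set, bit 63 of the value written to %cr3 is a "no-flush" bit, so a pivot like the ones in this file can switch page tables without invalidating TLB entries tagged with the target PCID. A hypothetical sketch follows (symbol names invented; the real decision-making lives in hat_switch()):

        /*
         * Illustrative no-flush %cr3 write under PCIDE. Bit 63 set
         * means: do not flush entries for the PCID in bits 0-11.
         */
        movq    example_kcr3(%rip), %rax        /* root phys addr | PCID */
        movabsq $0x8000000000000000, %r11       /* CR3 no-flush bit */
        orq     %r11, %rax
        movq    %rax, %cr3
]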
  99 
 100 /*
 101  * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
 102  * fix bugs here check to see if they should be fixed there as well.
 103  */
 104 
 105 #include <sys/asm_linkage.h>
 106 #include <sys/asm_misc.h>
 107 #include <sys/regset.h>
 108 #include <sys/privregs.h>
 109 #include <sys/psw.h>
 110 #include <sys/machbrand.h>
 111 #include <sys/param.h>


 486         mov     %r13, %cr3
 487         mov     %gs:CPU_KPTI_R13, %r13
 488         jmp     _brand_sys_sysenter_post_swapgs
 489         SET_SIZE(tr_brand_sys_sysenter)
 490 
 491 #define MK_SYSCALL_INT_TRAMPOLINE(isr)          \
 492         ENTRY_NP(tr_/**/isr);                   \
 493         swapgs;                                 \
 494         mov     %r13, %gs:CPU_KPTI_R13;         \
 495         SET_KERNEL_CR3(%r13);                   \
 496         mov     %gs:CPU_THREAD, %r13;           \
 497         mov     T_STACK(%r13), %r13;            \
 498         addq    $REGSIZE+MINFRAME, %r13;        \
 499         mov     %r13, %rsp;                     \
 500         pushq   %gs:CPU_KPTI_SS;                \
 501         pushq   %gs:CPU_KPTI_RSP;               \
 502         pushq   %gs:CPU_KPTI_RFLAGS;            \
 503         pushq   %gs:CPU_KPTI_CS;                \
 504         pushq   %gs:CPU_KPTI_RIP;               \
 505         mov     %gs:CPU_KPTI_R13, %r13;         \
 506         swapgs;                                 \
 507         jmp     isr;                            \
 508         SET_SIZE(tr_/**/isr)
 509 
 510         MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
 511         MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
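
[Editor's note: a reading aid for the macro above (an annotation, not source). The five pushq instructions rebuild, on the thread's kernel stack, an iretq-style frame from the %gs:CPU_KPTI_* slots, and the second swapgs puts the user gsbase back so the real isr -- which performs its own swapgs -- starts as if it had been reached directly:

        /*
         * Frame rebuilt by MK_SYSCALL_INT_TRAMPOLINE (highest address
         * first), assuming CPU_KPTI_SS..CPU_KPTI_RIP hold the frame
         * the CPU pushed at interrupt delivery:
         *
         *      SS      <- %gs:CPU_KPTI_SS
         *      RSP     <- %gs:CPU_KPTI_RSP
         *      RFLAGS  <- %gs:CPU_KPTI_RFLAGS
         *      CS      <- %gs:CPU_KPTI_CS
         *      RIP     <- %gs:CPU_KPTI_RIP     <- %rsp at "jmp isr"
         */
]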
 512 
 513         /*
 514          * Interrupt/trap return trampolines
 515          */
 516 
 517 .global tr_intr_ret_start
 518 tr_intr_ret_start:
 519 
 520         ENTRY_NP(tr_iret_auto)
 521         cmpq    $1, kpti_enable
 522         jne     tr_iret_kernel
 523         cmpw    $KCS_SEL, T_FRAMERET_CS(%rsp)
 524         je      tr_iret_kernel
 525         jmp     tr_iret_user
 526         SET_SIZE(tr_iret_auto)
 527 
 528         ENTRY_NP(tr_iret_kernel)
 529         /*
 530          * Yes, this does nothing extra. But this way we know if we see iret
 531          * elsewhere, then we've failed to properly consider trampolines there.
 532          */
 533         iretq
 534         SET_SIZE(tr_iret_kernel)
 535 
 536         ENTRY_NP(tr_iret_user)
 537 #if DEBUG
 538         /*
 539          * Panic if we find CR0.TS set. We're still on the kernel stack and
 540          * %cr3, but we do need to swap back to the kernel gs. (We don't worry
 541          * about swapgs speculation here.)
 542          */
 543         pushq   %rax
 544         mov     %cr0, %rax
 545         testq   $CR0_TS, %rax
 546         jz      1f
 547         swapgs
 548         popq    %rax
 549         leaq    _bad_ts_panic_msg(%rip), %rdi
 550         xorl    %eax, %eax
 551         pushq   %rbp
 552         movq    %rsp, %rbp
 553         call    panic
 554 1:
 555         popq    %rax
 556 #endif
 557 
 558         cmpq    $1, kpti_enable
 559         jne     1f
 560 
 561         /*
 562          * KPTI enabled: we're on the user gsbase at this point, so we
 563          * need to swap back so we can pivot stacks.
 564          *
 565          * The swapgs lfence mitigation is probably not needed here
 566          * since a mis-speculation of the above branch would imply KPTI
 567          * is disabled, but we'll do so anyway.
 568          */
 569         swapgs
 570         lfence
 571         mov     %r13, %gs:CPU_KPTI_R13
 572         PIVOT_KPTI_STK(%r13)
 573         SET_USER_CR3(%r13)
 574         mov     %gs:CPU_KPTI_R13, %r13
 575         /* Zero these to make sure they didn't leak from a kernel trap. */
 576         movq    $0, %gs:CPU_KPTI_R13
 577         movq    $0, %gs:CPU_KPTI_R14
 578         /* And back to user gsbase again. */
 579         swapgs
 580 1:
 581         iretq
 582         SET_SIZE(tr_iret_user)
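
[Editor's note: the lfence above is one half of the usual swapgs-speculation (CVE-2019-1125) mitigation; the other half applies on entry paths that must first decide whether a swapgs is needed at all. A hypothetical sketch of that entry-side pattern follows -- not the illumos code; the 8(%rsp) offset assumes a no-error-code interrupt frame with the saved %cs in that slot:

        /*
         * Illustrative entry-side sketch: fence both arms of the
         * swapgs decision so a mispredicted branch cannot run
         * %gs-relative loads with the wrong gsbase.
         */
        cmpw    $KCS_SEL, 8(%rsp)       /* interrupted %cs: kernel or user? */
        je      1f
        swapgs                          /* from userland: load kernel gsbase */
1:
        lfence                          /* serialize before %gs-relative loads */
]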
 583 
 584         /*
 585          * This special return trampoline is for KDI's use only (with kmdb).
 586          *
 587          * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
 588          * instead. This trampoline runs after GSBASE has already been changed
 589          * back to the userland value (so we can't use %gs).
 590          *
 591          * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
 592          * The KPTI_R13 member in the kpti_dbg has already been set to what the
 593          * real %r13 should be before we IRET.
 594          *
 595          * Additionally, KDI keeps a copy of the incoming %cr3 value when it
 596          * took an interrupt, and has put that back in the kpti_dbg area for us
 597          * to use, so we don't do any sniffing of %cs here. This is important
 598          * so that debugging code that changes %cr3 is possible.
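
[Editor's note: for context on "directly write the GSBASE MSR" -- a minimal sketch of such a write, illustrative only; 0xc0000101 is the architectural IA32_GS_BASE MSR number, and %r13 stands in for the new base address:

        /*
         * Illustrative WRMSR to GSBASE, avoiding swapgs entirely.
         * WRMSR writes %edx:%eax to the MSR selected by %ecx.
         */
        movl    $0xc0000101, %ecx       /* IA32_GS_BASE */
        movq    %r13, %rax              /* low 32 bits of new base */
        movq    %r13, %rdx
        shrq    $32, %rdx               /* high 32 bits */
        wrmsr
]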