1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /*
12 * Copyright 2018 Joyent, Inc.
13 */
14
15 /*
16 * This file contains the trampolines that are used by KPTI in order to be
17 * able to take interrupts/trap/etc while on the "user" page table.
18 *
19 * We don't map the full kernel text into the user page table: instead we
20 * map this one small section of trampolines (which compiles to ~13 pages).
21 * These trampolines are set in the IDT always (so they will run no matter
22 * whether we're on the kernel or user page table), and their primary job is to
23 * pivot us to the kernel %cr3 and %rsp without ruining everything.
24 *
25 * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
26 * meaning that they will execute with their %rsp set to a known location, even
27 * if we take them in the kernel.
28 *
29 * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
30 * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
31 * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
32 * page-aligned, and we map the page it's on into both page tables. Using a
71 * due to a bug in trampoline code, we preserve the original trampoline
72 * state that caused the trap.
73 *
74 * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
75 * stacks, since they can interrupt another ISR at any time. These stacks are
76 * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
77 * their trampolines (and do it unconditionally), and don't bother pivoting
78 * away. We're either going into the panic() path, or we're going to return
79 * straight away without rescheduling, so it's fine to not be on our real
80 * kthread stack (and some of the state we want to go find it with might be
81 * corrupt!)
82 *
83 * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
84 * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
85 * point at the PML4 for kas early in boot and never touch it again. Hopefully
86 * it survives whatever corruption brings down the rest of the kernel!
87 *
88 * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
89 * cases) in that they do not push an interrupt frame (and also have some other
90 * effects). In the syscall trampolines, we assume that we can only be taking
91 * the call from userland and use SWAPGS and an unconditional overwrite of %cr3.
92 * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
93 * existing %rsp pivot untouched) -- instead we spill registers into
94 * %gs:CPU_KPTI_* as we need to.
95 *
96 * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
97 * hat_switch().
98 */
99
100 /*
101 * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
102 * fix bugs here check to see if they should be fixed there as well.
103 */
104
105 #include <sys/asm_linkage.h>
106 #include <sys/asm_misc.h>
107 #include <sys/regset.h>
108 #include <sys/privregs.h>
109 #include <sys/psw.h>
110 #include <sys/machbrand.h>
111 #include <sys/param.h>
486 mov %r13, %cr3
487 mov %gs:CPU_KPTI_R13, %r13
488 jmp _brand_sys_sysenter_post_swapgs
489 SET_SIZE(tr_brand_sys_sysenter)
490
/*
 * Trampoline for the legacy syscall-via-software-interrupt entry points.
 *
 * Per the file header, syscall trampolines assume the call can only come
 * from userland, so the swapgs and %cr3 overwrite below are unconditional.
 * These gates are taken on the per-CPU kpti_frame IST stack (whose top is
 * &kf_tr_rsp -- see the header comment), so the iret frame the hardware
 * pushed is presumably sitting inside the kpti_frame struct itself; that
 * would be why the frame fields are re-read from %gs:CPU_KPTI_* below --
 * TODO confirm against the kpti_frame layout in machcpuvar.h.
 *
 * Only %r13 is used as scratch, and it is spilled/restored via
 * %gs:CPU_KPTI_R13 so the real ISR sees all registers intact.
 */
#define MK_SYSCALL_INT_TRAMPOLINE(isr) \
	ENTRY_NP(tr_/**/isr); \
	swapgs;				/* user -> kernel gsbase */ \
	mov	%r13, %gs:CPU_KPTI_R13;	/* spill our one scratch reg */ \
	SET_KERNEL_CR3(%r13); \
	mov	%gs:CPU_THREAD, %r13; \
	mov	T_STACK(%r13), %r13;	/* thread's kernel stack */ \
	addq	$REGSIZE+MINFRAME, %r13; \
	mov	%r13, %rsp;		/* pivot off the IST stack */ \
	/* Rebuild the hardware iret frame on the real kernel stack. */ \
	pushq	%gs:CPU_KPTI_SS; \
	pushq	%gs:CPU_KPTI_RSP; \
	pushq	%gs:CPU_KPTI_RFLAGS; \
	pushq	%gs:CPU_KPTI_CS; \
	pushq	%gs:CPU_KPTI_RIP; \
	mov	%gs:CPU_KPTI_R13, %r13;	/* restore scratch reg */ \
	/* Swap back: the ISR expects to be entered with user gsbase. */ \
	SWAPGS; \
	jmp	isr; \
	SET_SIZE(tr_/**/isr)
509
/* Instantiate the trampolines for the branded and normal syscall-int ISRs. */
MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
512
513 /*
514 * Interrupt/trap return trampolines
515 */
516
/*
 * Start-of-region marker for the interrupt/trap return trampolines.
 * NOTE(review): presumably consumed elsewhere to recognize a trap taken
 * while inside these return paths (cf. the "preserve the original
 * trampoline state" note in the file header) -- confirm against users
 * of this symbol.
 */
.global tr_intr_ret_start
tr_intr_ret_start:
519
/*
 * Dispatch to the right return trampoline: if KPTI is disabled, or the
 * frame's saved %cs is the kernel code selector (i.e. we're returning
 * kernel-to-kernel), a plain iretq suffices; otherwise we must take the
 * user return path that pivots the stack and %cr3.
 */
ENTRY_NP(tr_iret_auto)
	cmpq	$1, kpti_enable		/* KPTI off? plain iretq will do */
	jne	tr_iret_kernel
	cmpw	$KCS_SEL, T_FRAMERET_CS(%rsp)	/* returning to kernel %cs? */
	je	tr_iret_kernel
	jmp	tr_iret_user
	SET_SIZE(tr_iret_auto)
527
ENTRY_NP(tr_iret_kernel)
	/*
	 * Yes, this does nothing extra. But this way we know if we see iret
	 * elsewhere, then we've failed to properly consider trampolines there.
	 */
	/* Kernel-to-kernel return: no gsbase, %cr3 or stack change needed. */
	iretq
	SET_SIZE(tr_iret_kernel)
535
/*
 * Return-to-userland trampoline: pivot back onto the per-CPU kpti_frame
 * stack, switch to the user %cr3, and iretq. Entered with the iret frame
 * at the top of the (kernel) stack and user gsbase already in effect.
 */
ENTRY_NP(tr_iret_user)
#if DEBUG
	/*
	 * Panic if we find CR0.TS set. We're still on the kernel stack and
	 * %cr3, but we do need to swap back to the kernel gs (which the
	 * panic path does just before calling panic()).
	 */
	pushq	%rax
	mov	%cr0, %rax
	testq	$CR0_TS, %rax
	jz	1f
	swapgs
	popq	%rax
	leaq	_bad_ts_panic_msg(%rip), %rdi
	xorl	%eax, %eax		/* no vector args to variadic panic() */
	pushq	%rbp
	movq	%rsp, %rbp
	call	panic
1:
	popq	%rax
#endif

	cmpq	$1, kpti_enable
	jne	1f			/* KPTI disabled: plain iretq below */

	/*
	 * KPTI enabled: we're on the user gsbase at this point, so we need
	 * to swap back so we can pivot stacks.
	 *
	 * Fence after the swapgs to close the SWAPGS speculation window
	 * (CVE-2019-1125): without it, the instructions below may execute
	 * speculatively with the wrong gsbase. It is probably not strictly
	 * needed here, since a mis-speculation of the branch above would
	 * imply KPTI is disabled, but we do so anyway.
	 */
	swapgs
	lfence
	mov	%r13, %gs:CPU_KPTI_R13	/* spill scratch reg */
	PIVOT_KPTI_STK(%r13)		/* %rsp -> kpti_frame stack */
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap. */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
	/* And back to user gsbase again. */
	swapgs
1:
	iretq
	SET_SIZE(tr_iret_user)
574
575 /*
576 * This special return trampoline is for KDI's use only (with kmdb).
577 *
578 * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
579 * instead. This trampoline runs after GSBASE has already been changed
580 * back to the userland value (so we can't use %gs).
581 *
582 * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
583 * The KPTI_R13 member in the kpti_dbg has already been set to what the
584 * real %r13 should be before we IRET.
585 *
586 * Additionally, KDI keeps a copy of the incoming %cr3 value when it
587 * took an interrupt, and has put that back in the kpti_dbg area for us
588 * to use, so we don't do any sniffing of %cs here. This is important
589 * so that debugging code that changes %cr3 is possible.
|
1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11 /*
12 * Copyright 2019 Joyent, Inc.
13 */
14
15 /*
16 * This file contains the trampolines that are used by KPTI in order to be
17 * able to take interrupts/trap/etc while on the "user" page table.
18 *
19 * We don't map the full kernel text into the user page table: instead we
20 * map this one small section of trampolines (which compiles to ~13 pages).
21 * These trampolines are set in the IDT always (so they will run no matter
22 * whether we're on the kernel or user page table), and their primary job is to
23 * pivot us to the kernel %cr3 and %rsp without ruining everything.
24 *
25 * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
26 * meaning that they will execute with their %rsp set to a known location, even
27 * if we take them in the kernel.
28 *
29 * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
30 * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
31 * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
32 * page-aligned, and we map the page it's on into both page tables. Using a
71 * due to a bug in trampoline code, we preserve the original trampoline
72 * state that caused the trap.
73 *
74 * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
75 * stacks, since they can interrupt another ISR at any time. These stacks are
76 * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
77 * their trampolines (and do it unconditionally), and don't bother pivoting
78 * away. We're either going into the panic() path, or we're going to return
79 * straight away without rescheduling, so it's fine to not be on our real
80 * kthread stack (and some of the state we want to go find it with might be
81 * corrupt!)
82 *
83 * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
84 * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
85 * point at the PML4 for kas early in boot and never touch it again. Hopefully
86 * it survives whatever corruption brings down the rest of the kernel!
87 *
88 * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
89 * cases) in that they do not push an interrupt frame (and also have some other
90 * effects). In the syscall trampolines, we assume that we can only be taking
91 * the call from userland and use swapgs and an unconditional overwrite of %cr3.
92 * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
93 * existing %rsp pivot untouched) -- instead we spill registers into
94 * %gs:CPU_KPTI_* as we need to.
95 *
96 * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
97 * hat_switch().
98 */
99
100 /*
101 * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
102 * fix bugs here check to see if they should be fixed there as well.
103 */
104
105 #include <sys/asm_linkage.h>
106 #include <sys/asm_misc.h>
107 #include <sys/regset.h>
108 #include <sys/privregs.h>
109 #include <sys/psw.h>
110 #include <sys/machbrand.h>
111 #include <sys/param.h>
486 mov %r13, %cr3
487 mov %gs:CPU_KPTI_R13, %r13
488 jmp _brand_sys_sysenter_post_swapgs
489 SET_SIZE(tr_brand_sys_sysenter)
490
/*
 * Trampoline for the legacy syscall-via-software-interrupt entry points.
 *
 * Per the file header, syscall trampolines assume the call can only come
 * from userland, so the swapgs and %cr3 overwrite below are unconditional.
 * These gates are taken on the per-CPU kpti_frame IST stack (whose top is
 * &kf_tr_rsp -- see the header comment), so the iret frame the hardware
 * pushed is presumably sitting inside the kpti_frame struct itself; that
 * would be why the frame fields are re-read from %gs:CPU_KPTI_* below --
 * TODO confirm against the kpti_frame layout in machcpuvar.h.
 *
 * Only %r13 is used as scratch, and it is spilled/restored via
 * %gs:CPU_KPTI_R13 so the real ISR sees all registers intact.
 */
#define MK_SYSCALL_INT_TRAMPOLINE(isr) \
	ENTRY_NP(tr_/**/isr); \
	swapgs;				/* user -> kernel gsbase */ \
	mov	%r13, %gs:CPU_KPTI_R13;	/* spill our one scratch reg */ \
	SET_KERNEL_CR3(%r13); \
	mov	%gs:CPU_THREAD, %r13; \
	mov	T_STACK(%r13), %r13;	/* thread's kernel stack */ \
	addq	$REGSIZE+MINFRAME, %r13; \
	mov	%r13, %rsp;		/* pivot off the IST stack */ \
	/* Rebuild the hardware iret frame on the real kernel stack. */ \
	pushq	%gs:CPU_KPTI_SS; \
	pushq	%gs:CPU_KPTI_RSP; \
	pushq	%gs:CPU_KPTI_RFLAGS; \
	pushq	%gs:CPU_KPTI_CS; \
	pushq	%gs:CPU_KPTI_RIP; \
	mov	%gs:CPU_KPTI_R13, %r13;	/* restore scratch reg */ \
	/* Swap back: the ISR expects to be entered with user gsbase. */ \
	swapgs; \
	jmp	isr; \
	SET_SIZE(tr_/**/isr)
509
/* Instantiate the trampolines for the branded and normal syscall-int ISRs. */
MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
512
513 /*
514 * Interrupt/trap return trampolines
515 */
516
/*
 * Start-of-region marker for the interrupt/trap return trampolines.
 * NOTE(review): presumably consumed elsewhere to recognize a trap taken
 * while inside these return paths (cf. the "preserve the original
 * trampoline state" note in the file header) -- confirm against users
 * of this symbol.
 */
.global tr_intr_ret_start
tr_intr_ret_start:
519
/*
 * Dispatch to the right return trampoline: if KPTI is disabled, or the
 * frame's saved %cs is the kernel code selector (i.e. we're returning
 * kernel-to-kernel), a plain iretq suffices; otherwise we must take the
 * user return path that pivots the stack and %cr3.
 */
ENTRY_NP(tr_iret_auto)
	cmpq	$1, kpti_enable		/* KPTI off? plain iretq will do */
	jne	tr_iret_kernel
	cmpw	$KCS_SEL, T_FRAMERET_CS(%rsp)	/* returning to kernel %cs? */
	je	tr_iret_kernel
	jmp	tr_iret_user
	SET_SIZE(tr_iret_auto)
527
ENTRY_NP(tr_iret_kernel)
	/*
	 * Yes, this does nothing extra. But this way we know if we see iret
	 * elsewhere, then we've failed to properly consider trampolines there.
	 */
	/* Kernel-to-kernel return: no gsbase, %cr3 or stack change needed. */
	iretq
	SET_SIZE(tr_iret_kernel)
535
/*
 * Return-to-userland trampoline: pivot back onto the per-CPU kpti_frame
 * stack, switch to the user %cr3, and iretq. Entered with the iret frame
 * at the top of the (kernel) stack and user gsbase already in effect.
 */
ENTRY_NP(tr_iret_user)
#if DEBUG
	/*
	 * Panic if we find CR0.TS set. We're still on the kernel stack and
	 * %cr3, but we do need to swap back to the kernel gs. (We don't worry
	 * about swapgs speculation here.)
	 */
	pushq	%rax
	mov	%cr0, %rax
	testq	$CR0_TS, %rax
	jz	1f
	swapgs
	popq	%rax
	leaq	_bad_ts_panic_msg(%rip), %rdi
	xorl	%eax, %eax		/* no vector args to variadic panic() */
	pushq	%rbp
	movq	%rsp, %rbp
	call	panic
1:
	popq	%rax
#endif

	cmpq	$1, kpti_enable
	jne	1f			/* KPTI disabled: plain iretq below */

	/*
	 * KPTI enabled: we're on the user gsbase at this point, so we
	 * need to swap back so we can pivot stacks.
	 *
	 * The swapgs lfence mitigation is probably not needed here
	 * since a mis-speculation of the above branch would imply KPTI
	 * is disabled, but we'll do so anyway.
	 */
	swapgs
	lfence
	mov	%r13, %gs:CPU_KPTI_R13	/* spill scratch reg */
	PIVOT_KPTI_STK(%r13)		/* %rsp -> kpti_frame stack */
	SET_USER_CR3(%r13)
	mov	%gs:CPU_KPTI_R13, %r13
	/* Zero these to make sure they didn't leak from a kernel trap. */
	movq	$0, %gs:CPU_KPTI_R13
	movq	$0, %gs:CPU_KPTI_R14
	/* And back to user gsbase again. */
	swapgs
1:
	iretq
	SET_SIZE(tr_iret_user)
583
584 /*
585 * This special return trampoline is for KDI's use only (with kmdb).
586 *
587 * KDI/kmdb do not use swapgs -- they directly write the GSBASE MSR
588 * instead. This trampoline runs after GSBASE has already been changed
589 * back to the userland value (so we can't use %gs).
590 *
591 * Instead, the caller gives us a pointer to the kpti_dbg frame in %r13.
592 * The KPTI_R13 member in the kpti_dbg has already been set to what the
593 * real %r13 should be before we IRET.
594 *
595 * Additionally, KDI keeps a copy of the incoming %cr3 value when it
596 * took an interrupt, and has put that back in the kpti_dbg area for us
597 * to use, so we don't do any sniffing of %cs here. This is important
598 * so that debugging code that changes %cr3 is possible.
|