Print this page
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

@@ -18,11 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
  * Copyright (c) 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/asm_linkage.h>
 #include <sys/asm_misc.h>

@@ -489,10 +489,24 @@
         movq    %rbx, REGOFF_FS(%rsp)
         movw    %gs, %bx
         movq    %rbx, REGOFF_GS(%rsp)
 
         /*
+         * If we're trying to use TRAPTRACE though, I take that back: we're
+         * probably debugging some problem in the SWAPGS logic and want to know
+         * what the incoming gsbase was.
+         *
+         * Since we already did SWAPGS, record the KGSBASE.
+         */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+        movl    $MSR_AMD_KGSBASE, %ecx
+        rdmsr
+        movl    %eax, REGOFF_GSBASE(%rsp)
+        movl    %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+        /*
          * Machine state saved in the regs structure on the stack
          * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
          * %eax is the syscall number
          * %rsp is the thread's stack, %r15 is curthread
          * REG_RSP(%rsp) is the user's stack

@@ -669,12 +683,11 @@
          */
         ASSERT_UPCALL_MASK_IS_SET
         SYSRETQ
 #else
         ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
-        SWAPGS                          /* user gsbase */
-        SYSRETQ
+        jmp     tr_sysretq
 #endif
         /*NOTREACHED*/
         SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
 
 _syscall_pre:

@@ -771,10 +784,24 @@
         movq    %rbx, REGOFF_FS(%rsp)
         movw    %gs, %bx
         movq    %rbx, REGOFF_GS(%rsp)
 
         /*
+         * If we're trying to use TRAPTRACE though, I take that back: we're
+         * probably debugging some problem in the SWAPGS logic and want to know
+         * what the incoming gsbase was.
+         *
+         * Since we already did SWAPGS, record the KGSBASE.
+         */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+        movl    $MSR_AMD_KGSBASE, %ecx
+        rdmsr
+        movl    %eax, REGOFF_GSBASE(%rsp)
+        movl    %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+        /*
          * Application state saved in the regs structure on the stack
          * %eax is the syscall number
          * %rsp is the thread's stack, %r15 is curthread
          * REG_RSP(%rsp) is the user's stack
          */

@@ -887,12 +914,11 @@
         movl    REGOFF_RIP(%rsp), %ecx          /* %ecx -> %eip */
         movl    REGOFF_RSP(%rsp), %esp
 
         ASSERT_UPCALL_MASK_IS_SET
         ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
-        SWAPGS                          /* user gsbase */
-        SYSRETL
+        jmp     tr_sysretl
         SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
         /*NOTREACHED*/
 
 _full_syscall_postsys32:
         STI

@@ -933,27 +959,26 @@
  *
  * Note that we are unable to return both "rvals" to userland with
  * this call, as %edx is used by the sysexit instruction.
  *
  * One final complication in this routine is its interaction with
- * single-stepping in a debugger.  For most of the system call mechanisms,
- * the CPU automatically clears the single-step flag before we enter the
- * kernel.  The sysenter mechanism does not clear the flag, so a user
- * single-stepping through a libc routine may suddenly find themself
- * single-stepping through the kernel.  To detect this, kmdb compares the
- * trap %pc to the [brand_]sys_enter addresses on each single-step trap.
- * If it finds that we have single-stepped to a sysenter entry point, it
- * explicitly clears the flag and executes the sys_sysenter routine.
+ * single-stepping in a debugger.  For most of the system call mechanisms, the
+ * CPU automatically clears the single-step flag before we enter the kernel.
+ * The sysenter mechanism does not clear the flag, so a user single-stepping
+ * through a libc routine may suddenly find themself single-stepping through the
+ * kernel.  To detect this, kmdb and trap() both compare the trap %pc to the
+ * [brand_]sys_sysenter addresses on each single-step trap.  If either finds
+ * that we have single-stepped to a sysenter entry point, it explicitly clears
+ * the flag and executes the sys_sysenter routine.
  *
- * One final complication in this final complication is the fact that we
- * have two different entry points for sysenter: brand_sys_sysenter and
- * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
- * through the kernel with kmdb, we will eventually hit the instruction at
- * sys_sysenter.  kmdb cannot distinguish between that valid single-step
- * and the undesirable one mentioned above.  To avoid this situation, we
- * simply add a jump over the instruction at sys_sysenter to make it
- * impossible to single-step to it.
+ * One final complication in this final complication is the fact that we have
+ * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
+ * If we enter at brand_sys_sysenter and start single-stepping through the
+ * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
+ * kmdb cannot distinguish between that valid single-step and the undesirable
+ * one mentioned above.  To avoid this situation, we simply add a jump over the
+ * instruction at sys_sysenter to make it impossible to single-step to it.
  */
 #if defined(__lint)
 
 void
 sys_sysenter()

@@ -962,26 +987,31 @@
 #else   /* __lint */
 
         ENTRY_NP(brand_sys_sysenter)
         SWAPGS                          /* kernel gsbase */
         ALTENTRY(_brand_sys_sysenter_post_swapgs)
+
         BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
         /*
          * Jump over sys_sysenter to allow single-stepping as described
          * above.
          */
         jmp     _sys_sysenter_post_swapgs
 
         ALTENTRY(sys_sysenter)
         SWAPGS                          /* kernel gsbase */
-
         ALTENTRY(_sys_sysenter_post_swapgs)
+
         movq    %gs:CPU_THREAD, %r15
 
         movl    $U32CS_SEL, REGOFF_CS(%rsp)
         movl    %ecx, REGOFF_RSP(%rsp)          /* wrapper: %esp -> %ecx */
         movl    %edx, REGOFF_RIP(%rsp)          /* wrapper: %eip -> %edx */
+        /*
+         * NOTE: none of the instructions that run before we get here should
+         * clobber bits in (R)FLAGS! This includes the kpti trampoline.
+         */
         pushfq
         popq    %r10
         movl    $UDS_SEL, REGOFF_SS(%rsp)
 
         /*

@@ -1019,10 +1049,24 @@
         movq    %rbx, REGOFF_FS(%rsp)
         movw    %gs, %bx
         movq    %rbx, REGOFF_GS(%rsp)
 
         /*
+         * If we're trying to use TRAPTRACE though, I take that back: we're
+         * probably debugging some problem in the SWAPGS logic and want to know
+         * what the incoming gsbase was.
+         *
+         * Since we already did SWAPGS, record the KGSBASE.
+         */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+        movl    $MSR_AMD_KGSBASE, %ecx
+        rdmsr
+        movl    %eax, REGOFF_GSBASE(%rsp)
+        movl    %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+        /*
          * Application state saved in the regs structure on the stack
          * %eax is the syscall number
          * %rsp is the thread's stack, %r15 is curthread
          * REG_RSP(%rsp) is the user's stack
          */

@@ -1116,10 +1160,12 @@
         /*
          * We must protect ourselves from being descheduled here;
          * If we were, and we ended up on another cpu, or another
          * lwp got in ahead of us, it could change the segment
          * registers without us noticing before we return to userland.
+         *
+         * This cli is undone in the tr_sysexit trampoline code.
          */
         cli
         CHECK_POSTSYS_NE(%r15, %r14, %ebx)
         jne     _full_syscall_postsys32
         SIMPLE_SYSCALL_POSTSYS(%r15, %r14, %bx)

@@ -1149,13 +1195,11 @@
         movl    REGOFF_RIP(%rsp), %edx  /* sysexit: %edx -> %eip */
         pushq   REGOFF_RFL(%rsp)
         popfq
         movl    REGOFF_RSP(%rsp), %ecx  /* sysexit: %ecx -> %esp */
         ALTENTRY(sys_sysenter_swapgs_sysexit)
-        swapgs
-        sti
-        sysexit
+        jmp     tr_sysexit
         SET_SIZE(sys_sysenter_swapgs_sysexit)
         SET_SIZE(sys_sysenter)
         SET_SIZE(_sys_sysenter_post_swapgs)
         SET_SIZE(brand_sys_sysenter)
 

@@ -1202,14 +1246,17 @@
         /*
          * There should be no instructions between this label and SWAPGS/IRET
          * or we could end up breaking branded zone support. See the usage of
          * this label in lx_brand_int80_callback and sn1_brand_int91_callback
          * for examples.
+         *
+         * We want to swapgs to maintain the invariant that all entries into
+         * tr_iret_user are done on the user gsbase.
          */
         ALTENTRY(sys_sysint_swapgs_iret)
-        SWAPGS                          /* user gsbase */
-        IRET
+        SWAPGS
+        jmp     tr_iret_user
         /*NOTREACHED*/
         SET_SIZE(sys_sysint_swapgs_iret)
         SET_SIZE(sys_syscall_int)
         SET_SIZE(brand_sys_syscall_int)