9441 kmdb should stash %cr3 in kdiregs
Reviewed by: John Levon <john.levon@joyent.com>
8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

@@ -20,21 +20,22 @@
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
  */
 
-#pragma ident   "%Z%%M% %I%     %E% SMI"
-
 /*
- * Debugger entry for both master and slave CPUs
+ * Debugger entry and exit for both master and slave CPUs. kdi_idthdl.s contains
+ * the IDT stubs that drop into here (mainly via kdi_cmnint).
  */
 
 #if defined(__lint)
 #include <sys/types.h>
-#endif
+#else
 
 #include <sys/segments.h>
 #include <sys/asm_linkage.h>
 #include <sys/controlregs.h>
 #include <sys/x86_archext.h>

@@ -44,13 +45,10 @@
 #include <sys/psw.h>
 #include <sys/uadmin.h>
 #ifdef __xpv
 #include <sys/hypervisor.h>
 #endif
-
-#ifdef _ASM
-
 #include <kdi_assym.h>
 #include <assym.h>
 
 /* clobbers %rdx, %rcx, returns addr in %rax, CPU ID in %rbx */
 #define GET_CPUSAVE_ADDR \

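As a reader's aid: GET_CPUSAVE_ADDR amounts to indexing the per-CPU save-area
array by the current CPU ID. A minimal C sketch, assuming kdi_cpusave is the
per-CPU kdi_cpusave_t array named in the kdi sources:

        /* Sketch of GET_CPUSAVE_ADDR; the array name is an assumption. */
        extern kdi_cpusave_t kdi_cpusave[];

        static kdi_cpusave_t *
        get_cpusave_addr(void)
        {
                uint_t id = CPU->cpu_id;        /* the asm reads %gs:CPU_ID */
                return (&kdi_cpusave[id]);      /* asm: base + id * KRS_SIZE */
        }
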
@@ -78,10 +76,13 @@
         movq    %r11, KRS_GDT(%rax);            \
 1:
 
 #ifdef __xpv
 
+/*
+ * Already on kernel gsbase via the hypervisor.
+ */
 #define SAVE_GSBASE(reg) /* nothing */
 #define RESTORE_GSBASE(reg) /* nothing */
 
 #else
 

@@ -88,12 +89,20 @@
 #define SAVE_GSBASE(base)                               \
         movl    $MSR_AMD_GSBASE, %ecx;                  \
         rdmsr;                                          \
         shlq    $32, %rdx;                              \
         orq     %rax, %rdx;                             \
-        movq    %rdx, REG_OFF(KDIREG_GSBASE)(base)
+        movq    %rdx, REG_OFF(KDIREG_GSBASE)(base);     \
+        movl    $MSR_AMD_KGSBASE, %ecx;                 \
+        rdmsr;                                          \
+        shlq    $32, %rdx;                              \
+        orq     %rax, %rdx;                             \
+        movq    %rdx, REG_OFF(KDIREG_KGSBASE)(base)
 
+/*
+ * We shouldn't have stomped on KGSBASE, so don't try to restore it.
+ */
 #define RESTORE_GSBASE(base)                            \
         movq    REG_OFF(KDIREG_GSBASE)(base), %rdx;     \
         movq    %rdx, %rax;                             \
         shrq    $32, %rdx;                              \
         movl    $MSR_AMD_GSBASE, %ecx;                  \

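rdmsr returns a 64-bit MSR value split across %edx:%eax; the shlq/orq pair
above reassembles it, and the new KGSBASE read follows the same pattern. A
minimal C sketch of the same read, using GCC inline assembly:

        /* Read a 64-bit MSR: rdmsr leaves the halves in %edx:%eax. */
        static inline uint64_t
        rdmsr64(uint32_t msr)
        {
                uint32_t lo, hi;
                __asm__ __volatile__("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr));
                return (((uint64_t)hi << 32) | lo); /* shlq $32, %rdx; orq */
        }
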
@@ -100,13 +109,11 @@
         wrmsr
 
 #endif /* __xpv */
 
 /*
- * %ss, %rsp, %rflags, %cs, %rip, %err, %trapno are already on the stack.  Note
- * that on the hypervisor, we skip the save/restore of GSBASE: it's slow, and
- * unnecessary.
+ * %ss, %rsp, %rflags, %cs, %rip, %err, %trapno are already on the stack.
  */
 #define KDI_SAVE_REGS(base) \
         movq    %rdi, REG_OFF(KDIREG_RDI)(base);        \
         movq    %rsi, REG_OFF(KDIREG_RSI)(base);        \
         movq    %rdx, REG_OFF(KDIREG_RDX)(base);        \

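The seven values the comment names are the hardware interrupt frame plus the
error code and the trap number pushed by the IDT stubs. As a struct, lowest
address first, matching the tail of the kdi_regs.h layout:

        /* The frame already on the stack when KDI_SAVE_REGS runs. */
        struct kdi_hw_frame {
                uint64_t trapno;        /* pushed by the kdi_idthdl.s stub */
                uint64_t err;           /* pushed by hardware, or a dummy */
                uint64_t rip;           /* hardware-pushed, rip through ss */
                uint64_t cs;
                uint64_t rflags;
                uint64_t rsp;
                uint64_t ss;
        };
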
@@ -123,10 +130,12 @@
         movq    %r14, REG_OFF(KDIREG_R14)(base);        \
         movq    %r15, REG_OFF(KDIREG_R15)(base);        \
         movq    %rbp, REG_OFF(KDIREG_SAVFP)(base);      \
         movq    REG_OFF(KDIREG_RIP)(base), %rax;        \
         movq    %rax, REG_OFF(KDIREG_SAVPC)(base);      \
+        movq    %cr2, %rax;                             \
+        movq    %rax, REG_OFF(KDIREG_CR2)(base);        \
         clrq    %rax;                                   \
         movw    %ds, %ax;                               \
         movq    %rax, REG_OFF(KDIREG_DS)(base);         \
         movw    %es, %ax;                               \
         movq    %rax, REG_OFF(KDIREG_ES)(base);         \

@@ -141,10 +150,12 @@
         RESTORE_GSBASE(%rdi);                           \
         movq    REG_OFF(KDIREG_ES)(%rdi), %rax;         \
         movw    %ax, %es;                               \
         movq    REG_OFF(KDIREG_DS)(%rdi), %rax;         \
         movw    %ax, %ds;                               \
+        movq    REG_OFF(KDIREG_CR2)(%rdi), %rax;        \
+        movq    %rax, %cr2;                             \
         movq    REG_OFF(KDIREG_R15)(%rdi), %r15;        \
         movq    REG_OFF(KDIREG_R14)(%rdi), %r14;        \
         movq    REG_OFF(KDIREG_R13)(%rdi), %r13;        \
         movq    REG_OFF(KDIREG_R12)(%rdi), %r12;        \
         movq    REG_OFF(KDIREG_R11)(%rdi), %r11;        \

@@ -160,18 +171,13 @@
         movq    REG_OFF(KDIREG_RDI)(%rdi), %rdi
 
 /*
  * Given the address of the current CPU's cpusave area in %rdi, the following
  * macro restores the debugging state to said CPU.  Restored state includes
- * the debug registers from the global %dr variables, and debugging MSRs from
- * the CPU save area.  This code would be in a separate routine, but for the
- * fact that some of the MSRs are jump-sensitive.  As such, we need to minimize
- * the number of jumps taken subsequent to the update of said MSRs.  We can
- * remove one jump (the ret) by using a macro instead of a function for the
- * debugging state restoration code.
+ * the debug registers from the global %dr variables.
  *
- * Takes the cpusave area in %rdi as a parameter, clobbers %rax-%rdx
+ * Takes the cpusave area in %rdi as a parameter.
  */
 #define KDI_RESTORE_DEBUGGING_STATE \
         pushq   %rdi;                                           \
         leaq    kdi_drreg(%rip), %r15;                          \
         movl    $7, %edi;                                       \

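With the jump-sensitive MSR handling removed (the deletions just below), the
macro reduces to six kdi_dreg_set() calls. An equivalent C sketch, with the
kdi_drreg layout assumed from the kdi headers:

        /* Assumed shape of kdi_drreg; the asm uses DRADDR_OFF(n). */
        typedef struct kdi_drreg {
                ulong_t dr_ctl;         /* %dr7 */
                ulong_t dr_stat;        /* %dr6 */
                ulong_t dr_addr[4];     /* %dr0..%dr3 */
        } kdi_drreg_t;

        extern kdi_drreg_t kdi_drreg;
        extern void kdi_dreg_set(int, ulong_t);

        static void
        kdi_restore_dregs(void)
        {
                kdi_dreg_set(7, kdi_drreg.dr_ctl);
                kdi_dreg_set(6, kdi_drreg.dr_stat);
                for (int i = 0; i < 4; i++)
                        kdi_dreg_set(i, kdi_drreg.dr_addr[i]);
        }
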
@@ -192,54 +198,11 @@
         movq    DRADDR_OFF(2)(%r15), %rsi;                      \
         call    kdi_dreg_set;                                   \
         movl    $3, %edi;                                       \
         movq    DRADDR_OFF(3)(%r15), %rsi;                      \
         call    kdi_dreg_set;                                   \
-        popq    %rdi;                                           \
-                                                                \
-        /*                                                      \
-         * Write any requested MSRs.                            \
-         */                                                     \
-        movq    KRS_MSR(%rdi), %rbx;                            \
-        cmpq    $0, %rbx;                                       \
-        je      3f;                                             \
-1:                                                              \
-        movl    MSR_NUM(%rbx), %ecx;                            \
-        cmpl    $0, %ecx;                                       \
-        je      3f;                                             \
-                                                                \
-        movl    MSR_TYPE(%rbx), %edx;                           \
-        cmpl    $KDI_MSR_WRITE, %edx;                           \
-        jne     2f;                                             \
-                                                                \
-        movq    MSR_VALP(%rbx), %rdx;                           \
-        movl    0(%rdx), %eax;                                  \
-        movl    4(%rdx), %edx;                                  \
-        wrmsr;                                                  \
-2:                                                              \
-        addq    $MSR_SIZE, %rbx;                                \
-        jmp     1b;                                             \
-3:                                                              \
-        /*                                                      \
-         * We must not branch after re-enabling LBR.  If        \
-         * kdi_wsr_wrexit_msr is set, it contains the number    \
-         * of the MSR that controls LBR.  kdi_wsr_wrexit_valp   \
-         * contains the value that is to be written to enable   \
-         * LBR.                                                 \
-         */                                                     \
-        leaq    kdi_msr_wrexit_msr(%rip), %rcx;                 \
-        movl    (%rcx), %ecx;                                   \
-        cmpl    $0, %ecx;                                       \
-        je      1f;                                             \
-                                                                \
-        leaq    kdi_msr_wrexit_valp(%rip), %rdx;                \
-        movq    (%rdx), %rdx;                                   \
-        movl    0(%rdx), %eax;                                  \
-        movl    4(%rdx), %edx;                                  \
-                                                                \
-        wrmsr;                                                  \
-1:
+        popq    %rdi;
 
 /*
  * Each cpusave buffer has an area set aside for a ring buffer of breadcrumbs.
  * The following macros manage the buffer.
  */

@@ -268,19 +231,10 @@
 /* Set a value in the current breadcrumb buffer */
 #define ADD_CRUMB(cpusave, offset, value, tmp) \
         movq    KRS_CURCRUMB(cpusave), tmp;     \
         movq    value, offset(tmp)
 
-#endif  /* _ASM */
-
-#if defined(__lint)
-void
-kdi_cmnint(void)
-{
-}
-#else   /* __lint */
-
         /* XXX implement me */
         ENTRY_NP(kdi_nmiint)
         clrq    %rcx
         movq    (%rcx), %rcx
         SET_SIZE(kdi_nmiint)

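The breadcrumb macros above implement a simple per-CPU ring: advance the index
modulo the ring size, repoint the current-crumb pointer, and clear the new
slot before values are written into it. A C sketch, with KDI_NCRUMBS and the
krs_* field names taken on trust from the kdi headers:

        /* Roughly what ADVANCE_CRUMB_POINTER does; names are assumptions. */
        static void
        advance_crumb(kdi_cpusave_t *save)
        {
                save->krs_curcrumbidx =
                    (save->krs_curcrumbidx + 1) % KDI_NCRUMBS;
                save->krs_curcrumb = &save->krs_crumbs[save->krs_curcrumbidx];
                bzero(save->krs_curcrumb, sizeof (kdi_crumb_t));
        }
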
@@ -326,10 +280,34 @@
 
         movq    %rax, %rdx
         shrq    $32, %rdx
         movl    $MSR_AMD_GSBASE, %ecx
         wrmsr
+
+        /*
+         * In the trampoline we stashed the incoming %cr3. Copy this into
+         * the kdiregs for restoration and later use.
+         */
+        movq    %gs:(CPU_KPTI_DBG+KPTI_TR_CR3), %rdx
+        movq    %rdx, REG_OFF(KDIREG_CR3)(%rsp)
+        /*
+         * Switch to the kernel's %cr3. From the early interrupt handler
+         * until now we've been running on the "paranoid" %cr3 (that of kas
+         * from early in boot).
+         *
+         * If we took the interrupt from somewhere already on the kas/paranoid
+         * %cr3 though, don't change it (this could happen if kcr3 is corrupt
+         * and we took a gptrap earlier from this very code).
+         */
+        cmpq    %rdx, kpti_safe_cr3
+        je      .no_kcr3
+        movq    %gs:CPU_KPTI_KCR3, %rdx
+        cmpq    $0, %rdx
+        je      .no_kcr3
+        movq    %rdx, %cr3
+.no_kcr3:
+
 #endif  /* __xpv */
 
         GET_CPUSAVE_ADDR        /* %rax = cpusave, %rbx = CPU ID */
 
         ADVANCE_CRUMB_POINTER(%rax, %rcx, %rdx)

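Restated in C, the master-entry %cr3 handling above looks like this; the
struct and field names stand in for the CPU_KPTI_DBG, KPTI_TR_CR3, and
CPU_KPTI_KCR3 assym offsets, and setcr3() is as in the illumos sources:

        /* Sketch of the %cr3 logic; kpti field names are illustrative. */
        static void
        kdi_master_cr3(struct cpu *cp, kdi_regs_t *regs)
        {
                uint64_t tr_cr3 = cp->kpti_dbg.tr_cr3;  /* trampoline stash */

                regs->kdireg_cr3 = tr_cr3;  /* for restoration and later use */

                /* Already on the kas/paranoid %cr3, or kcr3 unset: stay. */
                if (tr_cr3 == kpti_safe_cr3 || cp->kpti_kcr3 == 0)
                        return;
                setcr3(cp->kpti_kcr3);
        }
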
@@ -349,17 +327,19 @@
          * Were we in the debugger when we took the trap (i.e. was %rsp in one
          * of the debugger's memory ranges)?
          */
         leaq    kdi_memranges, %rcx
         movl    kdi_nmemranges, %edx
-1:      cmpq    MR_BASE(%rcx), %rsp
+1:
+        cmpq    MR_BASE(%rcx), %rsp
         jl      2f              /* below this range -- try the next one */
         cmpq    MR_LIM(%rcx), %rsp
         jg      2f              /* above this range -- try the next one */
         jmp     3f              /* matched within this range */
 
-2:      decl    %edx
+2:
+        decl    %edx
         jz      kdi_save_common_state   /* %rsp not within debugger memory */
         addq    $MR_SIZE, %rcx
         jmp     1b
 
 3:      /*

@@ -385,12 +365,10 @@
         jmp     kdi_save_common_state
 
         SET_SIZE(kdi_master_entry)
         SET_SIZE(kdi_cmnint)
 
-#endif  /* __lint */
-
 /*
  * The cross-call handler for slave CPUs.
  *
  * The debugger is single-threaded, so only one CPU, called the master, may be
  * running it at any given time.  The other CPUs, known as slaves, spin in a

@@ -397,26 +375,12 @@
  * busy loop until there's something for them to do.  This is the entry point
  * for the slaves - they'll be sent here in response to a cross-call sent by the
  * master.
  */
 
-#if defined(__lint)
-char kdi_slave_entry_patch;
-
-void
-kdi_slave_entry(void)
-{
-}
-#else /* __lint */
-        .globl  kdi_slave_entry_patch;
-
         ENTRY_NP(kdi_slave_entry)
 
-        /* kdi_msr_add_clrentry knows where this is */
-kdi_slave_entry_patch:
-        KDI_MSR_PATCH;
-
         /*
          * Cross calls are implemented as function calls, so our stack currently
          * looks like one you'd get from a zero-argument function call.  That
          * is, there's the return %rip at %rsp, and that's about it.  We need
          * to make it look like an interrupt stack.  When we first save, we'll

@@ -436,10 +400,13 @@
         pushq   $-1             /* phony trap number */
 
         subq    $REG_OFF(KDIREG_TRAPNO), %rsp
         KDI_SAVE_REGS(%rsp)
 
+        movq    %cr3, %rax
+        movq    %rax, REG_OFF(KDIREG_CR3)(%rsp)
+
         movq    REG_OFF(KDIREG_SS)(%rsp), %rax
         xchgq   REG_OFF(KDIREG_RIP)(%rsp), %rax
         movq    %rax, REG_OFF(KDIREG_SS)(%rsp)
 
         movq    REG_OFF(KDIREG_RSP)(%rsp), %rax

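Unlike the master path, a slave arrives here by an ordinary function call
rather than through the KPTI trampolines, so there is no stashed value: the
live %cr3 is the one worth recording. In C terms, with getcr3() as in the
illumos sources:

        /* Slave path: no trampoline stash; record the live %cr3. */
        regs->kdireg_cr3 = getcr3();
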
@@ -463,12 +430,10 @@
         pushq   %rax
         jmp     kdi_save_common_state
 
         SET_SIZE(kdi_slave_entry)
 
-#endif  /* __lint */
-
 /*
  * The state of the world:
  *
  * The stack has a complete set of saved registers and segment
  * selectors, arranged in the kdi_regs.h order.  It also has a pointer

@@ -478,12 +443,10 @@
  * registers.  First we check whether we should jump straight back to
  * the kernel.  If not, we save a few more registers, ready the
  * machine for debugger entry, and enter the debugger.
  */
 
-#if !defined(__lint)
-
         ENTRY_NP(kdi_save_common_state)
 
         popq    %rdi                    /* the cpusave area */
         movq    %rsp, KRS_GREGS(%rdi)   /* save ptr to current saved regs */
 

@@ -535,41 +498,10 @@
         call    kdi_dreg_get
         movq    %rax, KRS_DROFF(3)(%r15)
 
         movq    %r15, %rax      /* restore cpu save area to rax */
 
-        /*
-         * Save any requested MSRs.
-         */
-        movq    KRS_MSR(%rax), %rcx
-        cmpq    $0, %rcx
-        je      no_msr
-
-        pushq   %rax            /* rdmsr clobbers %eax */
-        movq    %rcx, %rbx
-
-1:
-        movl    MSR_NUM(%rbx), %ecx
-        cmpl    $0, %ecx
-        je      msr_done
-
-        movl    MSR_TYPE(%rbx), %edx
-        cmpl    $KDI_MSR_READ, %edx
-        jne     msr_next
-
-        rdmsr                   /* addr in %ecx, value into %edx:%eax */
-        movl    %eax, MSR_VAL(%rbx)
-        movl    %edx, _CONST(MSR_VAL + 4)(%rbx)
-
-msr_next:
-        addq    $MSR_SIZE, %rbx
-        jmp     1b
-
-msr_done:
-        popq    %rax
-
-no_msr:
         clrq    %rbp            /* stack traces should end here */
 
         pushq   %rax
         movq    %rax, %rdi      /* cpusave */
 

@@ -580,23 +512,14 @@
 
         jmp     kdi_resume
 
         SET_SIZE(kdi_save_common_state)
 
-#endif  /* !__lint */
-
 /*
  * Resume the world.  The code that calls kdi_resume has already
  * decided whether or not to restore the IDT.
  */
-#if defined(__lint)
-void
-kdi_resume(void)
-{
-}
-#else   /* __lint */
-
         /* cpusave in %rdi */
         ENTRY_NP(kdi_resume)
 
         /*
          * Send this CPU back into the world

@@ -607,20 +530,47 @@
 #endif
 
         KDI_RESTORE_DEBUGGING_STATE
 
         movq    KRS_GREGS(%rdi), %rsp
+
+#if !defined(__xpv)
+        /*
+         * If we're going back via tr_iret_kdi, then we want to stash the
+         * final %cr3 we'll be loading into the kpti_dbg area now.
+         *
+         * Since the trampoline needs to find the kpti_dbg too, we enter it
+         * with %r13 set to point at that. The real %r13 (to restore before
+         * the iret) we stash in the kpti_dbg itself.
+         */
+        movq    %gs:CPU_SELF, %r13      /* can't leaq %gs:*, use self-ptr */
+        addq    $CPU_KPTI_DBG, %r13
+
+        movq    REG_OFF(KDIREG_R13)(%rsp), %rdx
+        movq    %rdx, KPTI_R13(%r13)
+
+        movq    REG_OFF(KDIREG_CR3)(%rsp), %rdx
+        movq    %rdx, KPTI_TR_CR3(%r13)
+
+        /* The trampoline will undo this later. */
+        movq    %r13, REG_OFF(KDIREG_R13)(%rsp)
+#endif
+
         KDI_RESTORE_REGS(%rsp)
         addq    $REG_OFF(KDIREG_RIP), %rsp      /* Discard state, trapno, err */
+        /*
+         * The common trampoline code will restore %cr3 to the right value
+         * for either kernel or userland.
+         */
+#if !defined(__xpv)
+        jmp     tr_iret_kdi
+#else
         IRET
+#endif
         /*NOTREACHED*/
         SET_SIZE(kdi_resume)
 
-#endif  /* __lint */
-
-#if !defined(__lint)
-
         ENTRY_NP(kdi_pass_to_kernel)
 
         popq    %rdi /* cpusave */
 
         movq    $KDI_CPU_STATE_NONE, KRS_CPU_STATE(%rdi)

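The hand-off from kdi_resume to tr_iret_kdi above has a simple shape in C;
the kpti_frame field names are assumed from machcpu.h, and the trampoline
undoes the %r13 swap and loads the stashed %cr3 just before its iret:

        /* Sketch of the tr_iret_kdi contract; field names are assumptions. */
        struct kpti_frame *dbg = &CPU->cpu_m.mcpu_kpti_dbg;

        dbg->kf_r13 = regs->kdireg_r13;    /* real %r13; trampoline restores */
        dbg->kf_tr_cr3 = regs->kdireg_cr3; /* final %cr3, loaded before iret */
        regs->kdireg_r13 = (uint64_t)dbg;  /* trampoline finds the frame */
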
@@ -687,20 +637,10 @@
 #endif
         /*NOTREACHED*/
 
         SET_SIZE(kdi_reboot)
 
-#endif  /* !__lint */
-
-#if defined(__lint)
-/*ARGSUSED*/
-void
-kdi_cpu_debug_init(kdi_cpusave_t *save)
-{
-}
-#else   /* __lint */
-
         ENTRY_NP(kdi_cpu_debug_init)
         pushq   %rbp
         movq    %rsp, %rbp
 
         pushq   %rbx            /* macro will clobber %rbx */

@@ -707,9 +647,34 @@
         KDI_RESTORE_DEBUGGING_STATE
         popq    %rbx
 
         leave
         ret
-
         SET_SIZE(kdi_cpu_debug_init)
-#endif  /* !__lint */
 
+#define GETDREG(name, r)        \
+        ENTRY_NP(name);         \
+        movq    r, %rax;        \
+        ret;                    \
+        SET_SIZE(name)
+
+#define SETDREG(name, r)        \
+        ENTRY_NP(name);         \
+        movq    %rdi, r;        \
+        ret;                    \
+        SET_SIZE(name)
+
+        GETDREG(kdi_getdr0, %dr0)
+        GETDREG(kdi_getdr1, %dr1)
+        GETDREG(kdi_getdr2, %dr2)
+        GETDREG(kdi_getdr3, %dr3)
+        GETDREG(kdi_getdr6, %dr6)
+        GETDREG(kdi_getdr7, %dr7)
+
+        SETDREG(kdi_setdr0, %dr0)
+        SETDREG(kdi_setdr1, %dr1)
+        SETDREG(kdi_setdr2, %dr2)
+        SETDREG(kdi_setdr3, %dr3)
+        SETDREG(kdi_setdr6, %dr6)
+        SETDREG(kdi_setdr7, %dr7)
+
+#endif /* !__lint */
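
From C, the accessors these macros generate look like the following; the
prototypes are a sketch of what the kdi_dreg_get/kdi_dreg_set paths would
declare:

        /* C-visible shape of the generated accessors (a sketch). */
        extern ulong_t kdi_getdr0(void);        /* movq %dr0, %rax; ret */
        extern void kdi_setdr0(ulong_t);        /* movq %rdi, %dr0; ret */
        /* ...and likewise for dr1-dr3, dr6, and dr7. */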