8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

@@ -24,11 +24,11 @@
 /*
  * Copyright (c) 2010, Intel Corporation.
  * All rights reserved.
  */
 /*
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
  */
 
 /*
  * Welcome to the world of the "real mode platter".
  * See also startup.c, mpcore.s and apic.c for related routines.

@@ -131,14 +131,15 @@
 rmp_gdt_init(rm_platter_t *rm)
 {
 
 #if defined(__amd64)
         /* Use the kas address space for the CPU startup thread. */
-        if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
+        if (mmu_ptob(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL) {
                 panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
                     "located above 4G in physical memory (@ 0x%lx)",
-                    MAKECR3(kas.a_hat->hat_htable->ht_pfn));
+                    mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
+        }
 
         /*
          * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
          * by code in real_mode_start_cpu():
          *

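Worth noting on the hunk above: with KPTI, MAKECR3() now takes a PCID argument, and its result can carry PCID bits in the low twelve bits of the CR3 image, so it is no longer a plain physical address and can't be compared against the 4G limit directly; mmu_ptob() converts the page frame number straight to a physical byte address. A minimal sketch of the distinction, using stand-in macro definitions rather than the real illumos headers:

#include <stdint.h>

/* Stand-in definitions; the real ones live in the illumos MMU headers. */
#define	MMU_PAGESHIFT	12
#define	mmu_ptob(pfn)	((uint64_t)(pfn) << MMU_PAGESHIFT)
#define	MAKECR3(pfn, pcid)	(mmu_ptob(pfn) | (uint64_t)(pcid))
#define	PCID_NONE	0

/*
 * The real-mode startup code loads CR3 as a 32-bit quantity, so the
 * kernel's top-level page table must sit below 4G in physical memory.
 */
int
pagetables_fit_trampoline(uint64_t pfn)
{
	return (mmu_ptob(pfn) <= 0xffffffffUL);
}
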
@@ -172,34 +173,82 @@
 static void *
 mach_cpucontext_alloc_tables(struct cpu *cp)
 {
         tss_t *ntss;
         struct cpu_tables *ct;
+        size_t ctsize;
 
         /*
          * Allocate space for stack, tss, gdt and idt. We round the size
          * allotted for cpu_tables up, so that the TSS is on a unique page.
          * This is more efficient when running in virtual machines.
          */
-        ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
+        ctsize = P2ROUNDUP(sizeof (*ct), PAGESIZE);
+        ct = kmem_zalloc(ctsize, KM_SLEEP);
         if ((uintptr_t)ct & PAGEOFFSET)
                 panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
                     cp->cpu_id);
 
         ntss = cp->cpu_tss = &ct->ct_tss;
 
 #if defined(__amd64)
+        uintptr_t va;
+        size_t len;
 
         /*
          * #DF (double fault).
          */
-        ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)];
+        ntss->tss_ist1 = (uintptr_t)&ct->ct_stack1[sizeof (ct->ct_stack1)];
 
+        /*
+         * NMI (non-maskable interrupt)
+         */
+        ntss->tss_ist2 = (uintptr_t)&ct->ct_stack2[sizeof (ct->ct_stack2)];
+
+        /*
+         * #MC (machine check exception / hardware error)
+         */
+        ntss->tss_ist3 = (uintptr_t)&ct->ct_stack3[sizeof (ct->ct_stack3)];
+
+        /*
+         * #DB, #BP debug interrupts and KDI/kmdb
+         */
+        ntss->tss_ist4 = (uintptr_t)&cp->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
+
+        if (kpti_enable == 1) {
+                /*
+                 * #GP, #PF, #SS fault interrupts
+                 */
+                ntss->tss_ist5 = (uintptr_t)&cp->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
+
+                /*
+                 * Used by all other interrupts
+                 */
+                ntss->tss_ist6 = (uintptr_t)&cp->cpu_m.mcpu_kpti.kf_tr_rsp;
+
+                /*
+                 * On AMD64 we need to make sure that all of the pages of the
+                 * struct cpu_tables are punched into this CPU's user page
+                 * tables for KPTI.
+                 *
+                 * The final page will always be the TSS, so treat that
+                 * separately.
+                 */
+                for (va = (uintptr_t)ct, len = ctsize - MMU_PAGESIZE;
+                    len >= MMU_PAGESIZE;
+                    len -= MMU_PAGESIZE, va += MMU_PAGESIZE) {
+                        /* All of the stack pages must be RW */
+                        hati_cpu_punchin(cp, va, PROT_READ | PROT_WRITE);
+                }
+                ASSERT3U((uintptr_t)ntss, ==, va);
+                hati_cpu_punchin(cp, (uintptr_t)ntss, PROT_READ);
+        }
+
 #elif defined(__i386)
 
         ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
-            (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)];
+            (uint32_t)&ct->ct_stack1[sizeof (ct->ct_stack1)];
 
         ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;
 
         ntss->tss_eip = (uint32_t)cp->cpu_thread->t_pc;
 

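The loop in the hunk above exists because, with KPTI enabled, the CPU must still be able to reach its TSS and IST stacks after switching to the stripped-down user page tables; every page of the struct cpu_tables is therefore punched into the per-CPU user mappings, the stack pages read/write and the final page (the TSS) read-only. A standalone sketch of the loop's arithmetic, with a hypothetical punchin() standing in for hati_cpu_punchin():

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define	MMU_PAGESIZE	4096
#define	PROT_READ	0x1
#define	PROT_WRITE	0x2

/* Hypothetical stand-in for hati_cpu_punchin(cp, va, prot). */
static void
punchin(uintptr_t va, unsigned int prot)
{
	(void) printf("map %#lx %s\n", (unsigned long)va,
	    (prot & PROT_WRITE) ? "rw" : "ro");
}

int
main(void)
{
	uintptr_t ct = 0x100000;	/* hypothetical cpu_tables base */
	size_t ctsize = 4 * MMU_PAGESIZE; /* page-rounded, as in the diff */
	uintptr_t va;
	size_t len;

	/* Every page but the last holds stacks and must be writable. */
	for (va = ct, len = ctsize - MMU_PAGESIZE; len >= MMU_PAGESIZE;
	    len -= MMU_PAGESIZE, va += MMU_PAGESIZE)
		punchin(va, PROT_READ | PROT_WRITE);

	/* The loop leaves va on the final page: the read-only TSS. */
	punchin(va, PROT_READ);
	return (0);
}
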
@@ -306,25 +355,21 @@
         rm->rm_gdt_base = cp->cpu_gdt;
         rm->rm_gdt_lim = sizeof (*cp->cpu_gdt) * NGDT - 1;
 
         /*
          * CPU needs to access kernel address space after powering on.
-         * When hot-adding CPU at runtime, directly use top level page table
-         * of kas other than the return value of getcr3(). getcr3() returns
-         * current process's top level page table, which may be different from
-         * the one of kas.
          */
-        rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn);
+        rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_NONE);
         rm->rm_cpu = cp->cpu_id;
 
         /*
-         * For hot-adding CPU at runtime, Machine Check and Performance Counter
-         * should be disabled. They will be enabled on demand after CPU powers
-         * on successfully
+         * We need to mask off any bits set on our boot CPU that can't apply
+         * while the subject CPU is initializing.  If appropriate, they are
+         * enabled later on.
          */
         rm->rm_cr4 = getcr4();
-        rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE);
+        rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE | CR4_PCIDE);
 
         rmp_gdt_init(rm);
 
         return (ct);
 }
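
Two KPTI-related details close out this last hunk: the platter's CR3 image is now built with an explicit PCID_NONE, and CR4_PCIDE joins the bits masked out of the boot CPU's CR4 before the image is handed to the starting CPU, since PCIDs can only be enabled once the new CPU is fully up in long mode. A sketch of the masking, using the architectural CR4 bit positions (the macro names mirror the diff; the helper is hypothetical):

#include <stdint.h>

/* Architectural CR4 bits (Intel SDM, vol. 3). */
#define	CR4_MCE		0x00000040	/* machine-check enable */
#define	CR4_PCE		0x00000100	/* perf-counter enable */
#define	CR4_PCIDE	0x00020000	/* process-context ID enable */

/*
 * Derive the CR4 image handed to a starting CPU from the boot CPU's
 * CR4, as the diff does with getcr4(); these features are re-enabled
 * later, if appropriate, once the CPU is up.
 */
uint64_t
startup_cr4(uint64_t boot_cr4)
{
	return (boot_cr4 & ~(CR4_MCE | CR4_PCE | CR4_PCIDE));
}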