8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

@@ -20,11 +20,11 @@
  */
 
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014 by Delphix. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
  */
 
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>

@@ -135,11 +135,11 @@
 {
         struct mmuext_op t;
         uint_t count;
 
         if (IN_XPV_PANIC()) {
-                mmu_tlbflush_entry((caddr_t)va);
+                mmu_flush_tlb_page((uintptr_t)va);
         } else {
                 t.cmd = MMUEXT_INVLPG_LOCAL;
                 t.arg1.linear_addr = (uintptr_t)va;
                 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
                         panic("HYPERVISOR_mmuext_op() failed");

@@ -152,11 +152,11 @@
 {
         struct mmuext_op t;
         uint_t count;
 
         if (IN_XPV_PANIC()) {
-                mmu_tlbflush_entry((caddr_t)va);
+                mmu_flush_tlb_page((uintptr_t)va);
                 return;
         }
 
         t.cmd = MMUEXT_INVLPG_MULTI;
         t.arg1.linear_addr = (uintptr_t)va;

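Both hunks above replace mmu_tlbflush_entry(caddr_t) with the renamed mmu_flush_tlb_page(uintptr_t) in the IN_XPV_PANIC() fallback path. On bare metal, a single-page flush of this kind reduces to an INVLPG of the target address on the current CPU. A minimal sketch, assuming an inline-asm helper; the real mmu_flush_tlb_page() introduced by this change also has to cope with PCIDs and is paired with cross-calls for remote CPUs:

/*
 * Illustrative sketch only, not the actual implementation:
 * invalidate the TLB entry covering va on the current CPU.
 */
static inline void
demo_flush_tlb_page(uintptr_t va)
{
        __asm__ __volatile__("invlpg (%0)" : : "r" (va) : "memory");
}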
@@ -619,15 +619,19 @@
                          * We also skip if HAT_FREEING because hat_pte_unmap()
                          * won't zero out the PTE's. That would lead to hitting
                          * stale PTEs either here or under hat_unload() when we
                          * steal and unload the same page table in competing
                          * threads.
+                         *
+                         * We skip HATs that belong to CPUs, to make our lives
+                         * simpler.
                          */
-                        while (hat != NULL &&
-                            (hat->hat_flags &
-                            (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
+                        while (hat != NULL && (hat->hat_flags &
+                            (HAT_VICTIM | HAT_SHARED | HAT_FREEING |
+                            HAT_PCP)) != 0) {
                                 hat = hat->hat_next;
+                        }
 
                         if (hat == NULL)
                                 break;
 
                         /*

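The victim-selection loop above now also skips HATs marked HAT_PCP, since per-CPU HATs must never have their page tables stolen. For reference while reading the rest of this change, here is a hedged summary of the flag vocabulary it introduces or renames; the bit values below are illustrative placeholders, not the real header definitions:

/*
 * Hedged summary of the hat/htable flags referenced in this change.
 * Values are placeholders for illustration only.
 */
#define HAT_VICTIM        0x01    /* hat currently has tables being stolen */
#define HAT_SHARED        0x02    /* hat shares page tables */
#define HAT_FREEING       0x04    /* hat is being torn down */
#define HAT_COPIED        0x08    /* was HAT_VLP: top-level PTEs kept in hat_t */
#define HAT_COPIED_32     0x10    /* HAT_COPIED for a 32-bit address space */
#define HAT_PCP           0x20    /* per-CPU hat (KPTI); never stolen from */

#define HTABLE_COPIED     0x01    /* was HTABLE_VLP: PTEs live in hat_copied_ptes */
#define HTABLE_SHARED_PFN 0x02    /* pagetable page is shared; never freed here */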
@@ -666,12 +670,12 @@
                         for (ht = list; (ht) && (reap); ht = ht->ht_next) {
                                 if (ht->ht_hat == NULL)
                                         continue;
                                 ASSERT(ht->ht_hat == hat);
 #if defined(__xpv) && defined(__amd64)
-                                if (!(ht->ht_flags & HTABLE_VLP) &&
-                                    ht->ht_level == mmu.max_level) {
+                                ASSERT(!(ht->ht_flags & HTABLE_COPIED));
+                                if (ht->ht_level == mmu.max_level) {
                                         ptable_free(hat->hat_user_ptable);
                                         hat->hat_user_ptable = PFN_INVALID;
                                 }
 #endif
                                 /*

@@ -777,20 +781,21 @@
         uintptr_t       vaddr,
         level_t         level,
         htable_t        *shared)
 {
         htable_t        *ht = NULL;
-        uint_t          is_vlp;
+        uint_t          is_copied;
         uint_t          is_bare = 0;
         uint_t          need_to_zero = 1;
         int             kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
 
         if (level < 0 || level > TOP_LEVEL(hat))
                 panic("htable_alloc(): level %d out of range\n", level);
 
-        is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
-        if (is_vlp || shared != NULL)
+        is_copied = (hat->hat_flags & HAT_COPIED) &&
+            level == hat->hat_max_level;
+        if (is_copied || shared != NULL)
                 is_bare = 1;
 
         /*
          * First reuse a cached htable from the hat_ht_cached field, this
          * avoids unnecessary trips through kmem/page allocators.

@@ -928,14 +933,14 @@
                 ht->ht_lock_cnt = 0;
                 ht->ht_valid_cnt = 0;
         }
 
         /*
-         * setup flags, etc. for VLP htables
+         * setup flags, etc. for copied page tables.
          */
-        if (is_vlp) {
-                ht->ht_flags |= HTABLE_VLP;
+        if (is_copied) {
+                ht->ht_flags |= HTABLE_COPIED;
                 ASSERT(ht->ht_pfn == PFN_INVALID);
                 need_to_zero = 0;
         }
 
         /*

@@ -982,11 +987,11 @@
          */
         if (hat != NULL &&
             !(ht->ht_flags & HTABLE_SHARED_PFN) &&
             (use_boot_reserve ||
             (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
-                ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
+                ASSERT((ht->ht_flags & HTABLE_COPIED) == 0);
                 ASSERT(ht->ht_pfn != PFN_INVALID);
                 hat_enter(hat);
                 ht->ht_next = hat->hat_ht_cached;
                 hat->hat_ht_cached = ht;
                 hat_exit(hat);

@@ -997,11 +1002,11 @@
          * If we have a hardware page table, free it.
          * We don't free page tables that are accessed by sharing.
          */
         if (ht->ht_flags & HTABLE_SHARED_PFN) {
                 ASSERT(ht->ht_pfn != PFN_INVALID);
-        } else if (!(ht->ht_flags & HTABLE_VLP)) {
+        } else if (!(ht->ht_flags & HTABLE_COPIED)) {
                 ptable_free(ht->ht_pfn);
 #if defined(__amd64) && defined(__xpv)
                 if (ht->ht_level == mmu.max_level && hat != NULL) {
                         ptable_free(hat->hat_user_ptable);
                         hat->hat_user_ptable = PFN_INVALID;

@@ -1109,19 +1114,19 @@
 #endif
                 panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
                     found, expect);
 
         /*
-         * When a top level VLP page table entry changes, we must issue
-         * a reload of cr3 on all processors.
+         * When a top level PTE changes for a copied htable, we must trigger a
+         * hat_pcp_update() on all HAT CPUs.
          *
-         * If we don't need do do that, then we still have to INVLPG against
-         * an address covered by the inner page table, as the latest processors
+         * If we don't need to do that, then we still have to INVLPG against an
+         * address covered by the inner page table, as the latest processors
          * have TLB-like caches for non-leaf page table entries.
          */
         if (!(hat->hat_flags & HAT_FREEING)) {
-                hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
+                hat_tlb_inval(hat, (higher->ht_flags & HTABLE_COPIED) ?
                     DEMAP_ALL_ADDR : old->ht_vaddr);
         }
 
         HTABLE_DEC(higher->ht_valid_cnt);
 }

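The rewritten comment above captures the KPTI-era rule: when a top-level PTE of a copied htable changes, reloading cr3 is no longer enough, because every CPU keeps its own private copy of those PTEs and must re-copy them via hat_pcp_update(). A hedged sketch of the per-CPU side of that invalidation; hat_pcp_update() and DEMAP_ALL_ADDR are names from this change, while the helpers and body below are illustrative:

/*
 * Illustrative sketch of what each CPU must do on receipt of the
 * invalidation under KPTI; not the real hat_tlb_inval() code.
 */
static void
demo_tlb_inval_handler(hat_t *hat, uintptr_t va)
{
        if (va == DEMAP_ALL_ADDR) {
                /* re-copy the hat's top-level PTEs into this CPU's tables */
                demo_pcp_update(hat);           /* hypothetical helper */
                demo_reload_cr3();              /* hypothetical helper */
        } else {
                demo_flush_tlb_page(va);        /* single-entry INVLPG */
        }
}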
@@ -1146,19 +1151,21 @@
         found = x86pte_cas(higher, entry, 0, newptp);
         if ((found & ~PT_REF) != 0)
                 panic("HAT: ptp not 0, found=" FMT_PTE, found);
 
         /*
-         * When any top level VLP page table entry changes, we must issue
-         * a reload of cr3 on all processors using it.
+         * When a top level PTE changes for a copied htable, we must trigger a
+         * hat_pcp_update() on all HAT CPUs.
+         *
          * We also need to do this for the kernel hat on PAE 32 bit kernel.
          */
         if (
 #ifdef __i386
-            (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
+            (higher->ht_hat == kas.a_hat &&
+            higher->ht_level == higher->ht_hat->hat_max_level) ||
 #endif
-            (higher->ht_flags & HTABLE_VLP))
+            (higher->ht_flags & HTABLE_COPIED))
                 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
 }
 
 /*
  * Release of hold on an htable. If this is the last use and the pagetable

@@ -1293,11 +1300,12 @@
 #if defined(__amd64)
                 /*
                  * 32 bit address spaces on 64 bit kernels need to check
                  * for overflow of the 32 bit address space
                  */
-                if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
+                if ((hat->hat_flags & HAT_COPIED_32) &&
+                    vaddr >= ((uint64_t)1 << 32))
                         return (NULL);
 #endif
                 base = 0;
         } else {
                 base = vaddr & LEVEL_MASK(level + 1);

@@ -1941,14 +1949,16 @@
  */
 static x86pte_t *
 x86pte_access_pagetable(htable_t *ht, uint_t index)
 {
         /*
-         * VLP pagetables are contained in the hat_t
+         * HTABLE_COPIED pagetables are contained in the hat_t
          */
-        if (ht->ht_flags & HTABLE_VLP)
-                return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
+        if (ht->ht_flags & HTABLE_COPIED) {
+                ASSERT3U(index, <, ht->ht_hat->hat_num_copied);
+                return (PT_INDEX_PTR(ht->ht_hat->hat_copied_ptes, index));
+        }
         return (x86pte_mapin(ht->ht_pfn, index, ht));
 }
 
 /*
  * map the given pfn into the page table window.

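As the hunk above shows, an HTABLE_COPIED pagetable is never mapped through a pagetable window: its entries are an array (hat_copied_ptes, bounded by the new hat_num_copied field) embedded directly in the hat_t. A sketch of the indexing PT_INDEX_PTR performs in that case; the scaling by mmu.pte_size is an assumption for illustration:

/*
 * Illustrative sketch: entry 'index' of a copied htable is plain
 * array arithmetic into the hat_t, scaled by the PTE size (4 or 8
 * bytes depending on PAE). The exact PT_INDEX_PTR expansion lives
 * in the headers.
 */
x86pte_t *
demo_copied_pte_ptr(hat_t *hat, uint_t index)
{
        ASSERT3U(index, <, hat->hat_num_copied);
        return ((x86pte_t *)((uintptr_t)hat->hat_copied_ptes +
            index * mmu.pte_size));
}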
@@ -1977,11 +1987,14 @@
 
         /*
          * Disable preemption and grab the CPU's hci_mutex
          */
         kpreempt_disable();
+
         ASSERT(CPU->cpu_hat_info != NULL);
+        ASSERT(!(getcr4() & CR4_PCIDE));
+
         mutex_enter(&CPU->cpu_hat_info->hci_mutex);
         x = PWIN_TABLE(CPU->cpu_id);
         pteptr = (x86pte_t *)PWIN_PTE_VA(x);
 #ifndef __xpv
         if (mmu.pae_hat)

@@ -2012,11 +2025,11 @@
                         if (mmu.pae_hat)
                                 *pteptr = newpte;
                         else
                                 *(x86pte32_t *)pteptr = newpte;
                         XPV_DISALLOW_PAGETABLE_UPDATES();
-                        mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
+                        mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
                 }
         }
         return (PT_INDEX_PTR(PWIN_VA(x), index));
 }
 
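The new ASSERT against CR4_PCIDE deserves a note: for non-global mappings, INVLPG only flushes the translation belonging to the current PCID, so if PCIDs were enabled the window remap below could leave stale translations alive under other PCIDs. A minimal sketch of the invariant, assuming the getcr4()/CR4_PCIDE usage shown in the diff:

/*
 * Sketch: the pagetable-window protocol relies on INVLPG removing
 * the only live translation for the window VA, which does not hold
 * per-PCID. The PWIN code therefore requires PCIDs to be off.
 */
static void
demo_assert_no_pcid(void)
{
        ASSERT(!(getcr4() & CR4_PCIDE));
}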

@@ -2024,14 +2037,11 @@
  * Release access to a page table.
  */
 static void
 x86pte_release_pagetable(htable_t *ht)
 {
-        /*
-         * nothing to do for VLP htables
-         */
-        if (ht->ht_flags & HTABLE_VLP)
+        if (ht->ht_flags & HTABLE_COPIED)
                 return;
 
         x86pte_mapout();
 }
 

@@ -2128,11 +2138,11 @@
 #ifdef __xpv
                         if (!IN_XPV_PANIC())
                                 xen_flush_va((caddr_t)addr);
                         else
 #endif
-                                mmu_tlbflush_entry((caddr_t)addr);
+                                mmu_flush_tlb_page(addr);
                         goto done;
                 }
 
                 /*
                  * Detect if we have a collision of installing a large

@@ -2187,11 +2197,11 @@
         int cnt = 1;
         int count;
         maddr_t ma;
 
         if (!IN_XPV_PANIC()) {
-                ASSERT(!(ht->ht_flags & HTABLE_VLP));   /* no VLP yet */
+                ASSERT(!(ht->ht_flags & HTABLE_COPIED));
                 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
                 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
                 t[0].val = new;
 
 #if defined(__amd64)

@@ -2344,11 +2354,11 @@
 
 #ifndef __xpv
 /*
  * Copy page tables - this is just a little more complicated than the
  * previous routines. Note that it's also not atomic! It also is never
- * used for VLP pagetables.
+ * used for HTABLE_COPIED pagetables.
  */
 void
 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
 {
         caddr_t src_va;

@@ -2356,12 +2366,12 @@
         size_t size;
         x86pte_t *pteptr;
         x86pte_t pte;
 
         ASSERT(khat_running);
-        ASSERT(!(dest->ht_flags & HTABLE_VLP));
-        ASSERT(!(src->ht_flags & HTABLE_VLP));
+        ASSERT(!(dest->ht_flags & HTABLE_COPIED));
+        ASSERT(!(src->ht_flags & HTABLE_COPIED));
         ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
         ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
 
         /*
          * Acquire access to the CPU pagetable windows for the dest and source.

@@ -2371,10 +2381,12 @@
                 src_va = (caddr_t)
                     PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
         } else {
                 uint_t x = PWIN_SRC(CPU->cpu_id);
 
+                ASSERT(!(getcr4() & CR4_PCIDE));
+
                 /*
                  * Finish defining the src pagetable mapping
                  */
                 src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
                 pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;

@@ -2381,11 +2393,11 @@
                 pteptr = (x86pte_t *)PWIN_PTE_VA(x);
                 if (mmu.pae_hat)
                         *pteptr = pte;
                 else
                         *(x86pte32_t *)pteptr = pte;
-                mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
+                mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
         }
 
         /*
          * now do the copy
          */

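Taken together with the mapin path earlier, the non-kpm branch of x86pte_copy() follows the usual pagetable-window protocol, now flushing through mmu_flush_tlb_kpage(). A condensed sketch of the sequence; error handling and the non-PAE 32-bit PTE write are elided, and 'entry' is the starting index as in the diff:

/*
 * Condensed sketch of the source-window setup used above:
 *   1. claim this CPU's source window slot
 *   2. point the window PTE at the source pagetable's pfn
 *   3. flush the window VA so the new mapping takes effect
 *   4. read/copy the PTEs through the window
 */
uint_t x = PWIN_SRC(CPU->cpu_id);
x86pte_t pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;

*(x86pte_t *)PWIN_PTE_VA(x) = pte;                      /* step 2 (PAE) */
mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));             /* step 3 */
src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);      /* step 4 */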
@@ -2448,11 +2460,11 @@
 
         /*
          * Map in the page table to be zeroed.
          */
         ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
-        ASSERT(!(dest->ht_flags & HTABLE_VLP));
+        ASSERT(!(dest->ht_flags & HTABLE_COPIED));
 
         /*
          * On the hypervisor we don't use x86pte_access_pagetable() since
          * in this case the page is not pinned yet.
          */

@@ -2502,11 +2514,11 @@
          * Dump all page tables
          */
         for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
                 for (h = 0; h < hat->hat_num_hash; ++h) {
                         for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
-                                if ((ht->ht_flags & HTABLE_VLP) == 0)
+                                if ((ht->ht_flags & HTABLE_COPIED) == 0)
                                         dump_page(ht->ht_pfn);
                         }
                 }
         }
 }