8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

*** 20,30 ****
   */
  
  /*
   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright (c) 2014 by Delphix. All rights reserved.
!  * Copyright 2015 Joyent, Inc.
   */
  
  #include <sys/types.h>
  #include <sys/sysmacros.h>
  #include <sys/kmem.h>
--- 20,30 ----
   */
  
  /*
   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright (c) 2014 by Delphix. All rights reserved.
!  * Copyright 2018 Joyent, Inc.
   */
  
  #include <sys/types.h>
  #include <sys/sysmacros.h>
  #include <sys/kmem.h>
*** 135,145 ****
  {
  	struct mmuext_op t;
  	uint_t count;
  
  	if (IN_XPV_PANIC()) {
! 		mmu_tlbflush_entry((caddr_t)va);
  	} else {
  		t.cmd = MMUEXT_INVLPG_LOCAL;
  		t.arg1.linear_addr = (uintptr_t)va;
  		if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
  			panic("HYPERVISOR_mmuext_op() failed");
--- 135,145 ----
  {
  	struct mmuext_op t;
  	uint_t count;
  
  	if (IN_XPV_PANIC()) {
! 		mmu_flush_tlb_page((uintptr_t)va);
  	} else {
  		t.cmd = MMUEXT_INVLPG_LOCAL;
  		t.arg1.linear_addr = (uintptr_t)va;
  		if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
  			panic("HYPERVISOR_mmuext_op() failed");
*** 152,162 ****
  {
  	struct mmuext_op t;
  	uint_t count;
  
  	if (IN_XPV_PANIC()) {
! 		mmu_tlbflush_entry((caddr_t)va);
  		return;
  	}
  
  	t.cmd = MMUEXT_INVLPG_MULTI;
  	t.arg1.linear_addr = (uintptr_t)va;
--- 152,162 ----
  {
  	struct mmuext_op t;
  	uint_t count;
  
  	if (IN_XPV_PANIC()) {
! 		mmu_flush_tlb_page((uintptr_t)va);
  		return;
  	}
  
  	t.cmd = MMUEXT_INVLPG_MULTI;
  	t.arg1.linear_addr = (uintptr_t)va;
*** 619,633 ****
  		 * We also skip if HAT_FREEING because hat_pte_unmap()
  		 * won't zero out the PTE's. That would lead to hitting
  		 * stale PTEs either here or under hat_unload() when we
  		 * steal and unload the same page table in competing
  		 * threads.
  		 */
! 		while (hat != NULL &&
! 		    (hat->hat_flags &
! 		    (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
  			hat = hat->hat_next;
  
  		if (hat == NULL)
  			break;
  
  		/*
--- 619,637 ----
  		 * We also skip if HAT_FREEING because hat_pte_unmap()
  		 * won't zero out the PTE's. That would lead to hitting
  		 * stale PTEs either here or under hat_unload() when we
  		 * steal and unload the same page table in competing
  		 * threads.
+ 		 *
+ 		 * We skip HATs that belong to CPUs, to make our lives
+ 		 * simpler.
  		 */
! 		while (hat != NULL && (hat->hat_flags &
! 		    (HAT_VICTIM | HAT_SHARED | HAT_FREEING |
! 		    HAT_PCP)) != 0) {
  			hat = hat->hat_next;
+ 		}
  
  		if (hat == NULL)
  			break;
  
  		/*
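Note on the hunk above: HAT_PCP marks the per-CPU HATs that KPTI introduces, and htable_steal() now skips them along with victim, shared, and freeing HATs. The following is a stand-alone sketch of that filter only; the flag values, field names, and hat_model_t type are invented for illustration and are not the kernel's definitions.

/*
 * Stand-alone sketch (not kernel code): the htable_steal() hat filter
 * with the new HAT_PCP flag.  Flag values and types are invented.
 */
#include <stdio.h>

#define	HAT_VICTIM	0x01
#define	HAT_SHARED	0x02
#define	HAT_FREEING	0x04
#define	HAT_PCP		0x08	/* per-CPU HAT introduced by KPTI */

typedef struct hat_model {
	const char		*hat_name;
	unsigned int		hat_flags;
	struct hat_model	*hat_next;
} hat_model_t;

/* Return the first hat in the list that we are allowed to steal from. */
static hat_model_t *
steal_candidate(hat_model_t *hat)
{
	while (hat != NULL && (hat->hat_flags &
	    (HAT_VICTIM | HAT_SHARED | HAT_FREEING | HAT_PCP)) != 0) {
		hat = hat->hat_next;
	}
	return (hat);
}

int
main(void)
{
	hat_model_t userb = { "user-b", 0, NULL };
	hat_model_t cpu0 = { "cpu0-pcp", HAT_PCP, &userb };
	hat_model_t usera = { "user-a", HAT_FREEING, &cpu0 };
	hat_model_t *hat = steal_candidate(&usera);

	printf("steal from: %s\n", hat != NULL ? hat->hat_name : "none");
	return (0);
}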
*** 666,677 ****
  	for (ht = list; (ht) && (reap); ht = ht->ht_next) {
  		if (ht->ht_hat == NULL)
  			continue;
  		ASSERT(ht->ht_hat == hat);
  #if defined(__xpv) && defined(__amd64)
! 		if (!(ht->ht_flags & HTABLE_VLP) &&
! 		    ht->ht_level == mmu.max_level) {
  			ptable_free(hat->hat_user_ptable);
  			hat->hat_user_ptable = PFN_INVALID;
  		}
  #endif
  		/*
--- 670,681 ----
  	for (ht = list; (ht) && (reap); ht = ht->ht_next) {
  		if (ht->ht_hat == NULL)
  			continue;
  		ASSERT(ht->ht_hat == hat);
  #if defined(__xpv) && defined(__amd64)
! 		ASSERT(!(ht->ht_flags & HTABLE_COPIED));
! 		if (ht->ht_level == mmu.max_level) {
  			ptable_free(hat->hat_user_ptable);
  			hat->hat_user_ptable = PFN_INVALID;
  		}
  #endif
  		/*
*** 777,796 ****
      uintptr_t vaddr, level_t level, htable_t *shared)
  {
  	htable_t *ht = NULL;
! 	uint_t is_vlp;
  	uint_t is_bare = 0;
  	uint_t need_to_zero = 1;
  	int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
  
  	if (level < 0 || level > TOP_LEVEL(hat))
  		panic("htable_alloc(): level %d out of range\n", level);
  
! 	is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
! 	if (is_vlp || shared != NULL)
  		is_bare = 1;
  
  	/*
  	 * First reuse a cached htable from the hat_ht_cached field, this
  	 * avoids unnecessary trips through kmem/page allocators.
--- 781,801 ----
      uintptr_t vaddr, level_t level, htable_t *shared)
  {
  	htable_t *ht = NULL;
! 	uint_t is_copied;
  	uint_t is_bare = 0;
  	uint_t need_to_zero = 1;
  	int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
  
  	if (level < 0 || level > TOP_LEVEL(hat))
  		panic("htable_alloc(): level %d out of range\n", level);
  
! 	is_copied = (hat->hat_flags & HAT_COPIED) &&
! 	    level == hat->hat_max_level;
! 	if (is_copied || shared != NULL)
  		is_bare = 1;
  
  	/*
  	 * First reuse a cached htable from the hat_ht_cached field, this
  	 * avoids unnecessary trips through kmem/page allocators.
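In the hunk above, htable_alloc() treats the top-level table of a HAT_COPIED address space as "bare": it gets no pagetable page of its own, since its live entries are kept in the hat and copied onto the CPU's per-CPU tables. Below is a stand-alone sketch of that decision; the HAT_COPIED value and the struct fields are assumptions for illustration, not the kernel definitions.

/*
 * Stand-alone sketch (not kernel code): the "bare htable" decision from
 * htable_alloc().  Flag value and fields are assumed for illustration.
 */
#include <stdio.h>

#define	HAT_COPIED	0x01

typedef struct hat_model {
	unsigned int	hat_flags;
	int		hat_max_level;	/* highest pagetable level in this hat */
} hat_model_t;

/*
 * A top-level table in a HAT_COPIED address space gets no pagetable page
 * of its own; shared htables are likewise bare.
 */
static int
htable_is_bare(const hat_model_t *hat, int level, int shared)
{
	int is_copied = (hat->hat_flags & HAT_COPIED) != 0 &&
	    level == hat->hat_max_level;

	return (is_copied || shared);
}

int
main(void)
{
	hat_model_t hat = { HAT_COPIED, 3 };	/* 4-level paging, levels 0..3 */

	printf("level 3 bare: %d\n", htable_is_bare(&hat, 3, 0));	/* 1 */
	printf("level 0 bare: %d\n", htable_is_bare(&hat, 0, 0));	/* 0 */
	return (0);
}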
*** 928,941 ****
  		ht->ht_lock_cnt = 0;
  		ht->ht_valid_cnt = 0;
  	}
  
  	/*
! 	 * setup flags, etc. for VLP htables
  	 */
! 	if (is_vlp) {
! 		ht->ht_flags |= HTABLE_VLP;
  		ASSERT(ht->ht_pfn == PFN_INVALID);
  		need_to_zero = 0;
  	}
  
  	/*
--- 933,946 ----
  		ht->ht_lock_cnt = 0;
  		ht->ht_valid_cnt = 0;
  	}
  
  	/*
! 	 * setup flags, etc. for copied page tables.
  	 */
! 	if (is_copied) {
! 		ht->ht_flags |= HTABLE_COPIED;
  		ASSERT(ht->ht_pfn == PFN_INVALID);
  		need_to_zero = 0;
  	}
  
  	/*
*** 982,992 ****
  	 */
  	if (hat != NULL &&
  	    !(ht->ht_flags & HTABLE_SHARED_PFN) &&
  	    (use_boot_reserve ||
  	    (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
! 		ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
  		ASSERT(ht->ht_pfn != PFN_INVALID);
  		hat_enter(hat);
  		ht->ht_next = hat->hat_ht_cached;
  		hat->hat_ht_cached = ht;
  		hat_exit(hat);
--- 987,997 ----
  	 */
  	if (hat != NULL &&
  	    !(ht->ht_flags & HTABLE_SHARED_PFN) &&
  	    (use_boot_reserve ||
  	    (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
! 		ASSERT((ht->ht_flags & HTABLE_COPIED) == 0);
  		ASSERT(ht->ht_pfn != PFN_INVALID);
  		hat_enter(hat);
  		ht->ht_next = hat->hat_ht_cached;
  		hat->hat_ht_cached = ht;
  		hat_exit(hat);
*** 997,1007 ****
  	 * If we have a hardware page table, free it.
  	 * We don't free page tables that are accessed by sharing.
  	 */
  	if (ht->ht_flags & HTABLE_SHARED_PFN) {
  		ASSERT(ht->ht_pfn != PFN_INVALID);
! 	} else if (!(ht->ht_flags & HTABLE_VLP)) {
  		ptable_free(ht->ht_pfn);
  #if defined(__amd64) && defined(__xpv)
  		if (ht->ht_level == mmu.max_level && hat != NULL) {
  			ptable_free(hat->hat_user_ptable);
  			hat->hat_user_ptable = PFN_INVALID;
--- 1002,1012 ----
  	 * If we have a hardware page table, free it.
  	 * We don't free page tables that are accessed by sharing.
  	 */
  	if (ht->ht_flags & HTABLE_SHARED_PFN) {
  		ASSERT(ht->ht_pfn != PFN_INVALID);
! 	} else if (!(ht->ht_flags & HTABLE_COPIED)) {
  		ptable_free(ht->ht_pfn);
  #if defined(__amd64) && defined(__xpv)
  		if (ht->ht_level == mmu.max_level && hat != NULL) {
  			ptable_free(hat->hat_user_ptable);
  			hat->hat_user_ptable = PFN_INVALID;
*** 1109,1127 ****
  #endif
  		panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
  		    found, expect);
  
  	/*
! 	 * When a top level VLP page table entry changes, we must issue
! 	 * a reload of cr3 on all processors.
  	 *
! 	 * If we don't need do do that, then we still have to INVLPG against
! 	 * an address covered by the inner page table, as the latest processors
  	 * have TLB-like caches for non-leaf page table entries.
  	 */
  	if (!(hat->hat_flags & HAT_FREEING)) {
! 		hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
  		    DEMAP_ALL_ADDR : old->ht_vaddr);
  	}
  
  	HTABLE_DEC(higher->ht_valid_cnt);
  }
--- 1114,1132 ----
  #endif
  		panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
  		    found, expect);
  
  	/*
! 	 * When a top level PTE changes for a copied htable, we must trigger a
! 	 * hat_pcp_update() on all HAT CPUs.
  	 *
! 	 * If we don't need do do that, then we still have to INVLPG against an
! 	 * address covered by the inner page table, as the latest processors
  	 * have TLB-like caches for non-leaf page table entries.
  	 */
  	if (!(hat->hat_flags & HAT_FREEING)) {
! 		hat_tlb_inval(hat, (higher->ht_flags & HTABLE_COPIED) ?
  		    DEMAP_ALL_ADDR : old->ht_vaddr);
  	}
  
  	HTABLE_DEC(higher->ht_valid_cnt);
  }
*** 1146,1164 ****
  	found = x86pte_cas(higher, entry, 0, newptp);
  	if ((found & ~PT_REF) != 0)
  		panic("HAT: ptp not 0, found=" FMT_PTE, found);
  
  	/*
! 	 * When any top level VLP page table entry changes, we must issue
! 	 * a reload of cr3 on all processors using it.
  	 * We also need to do this for the kernel hat on PAE 32 bit kernel.
  	 */
  	if (
  #ifdef __i386
! 	    (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
  #endif
! 	    (higher->ht_flags & HTABLE_VLP))
  		hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
  }
  
  /*
   * Release of hold on an htable. If this is the last use and the pagetable
--- 1151,1171 ----
  	found = x86pte_cas(higher, entry, 0, newptp);
  	if ((found & ~PT_REF) != 0)
  		panic("HAT: ptp not 0, found=" FMT_PTE, found);
  
  	/*
! 	 * When a top level PTE changes for a copied htable, we must trigger a
! 	 * hat_pcp_update() on all HAT CPUs.
! 	 *
  	 * We also need to do this for the kernel hat on PAE 32 bit kernel.
  	 */
  	if (
  #ifdef __i386
! 	    (higher->ht_hat == kas.a_hat &&
! 	    higher->ht_level == higher->ht_hat->hat_max_level) ||
  #endif
! 	    (higher->ht_flags & HTABLE_COPIED))
  		hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
  }
  
  /*
   * Release of hold on an htable. If this is the last use and the pagetable
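The previous two hunks encode the same rule: a change to a top-level entry of an HTABLE_COPIED pagetable must be republished to every CPU (DEMAP_ALL_ADDR), while any other pagetable link or unlink only needs a targeted INVLPG of an address the inner table covers. Below is a stand-alone sketch of that choice; DEMAP_ALL_ADDR's value, the flag bit, and the struct layout are stand-ins for illustration.

/*
 * Stand-alone sketch (not kernel code): choosing the invalidation scope
 * after linking or unlinking a pagetable.
 */
#include <stdint.h>
#include <stdio.h>

#define	HTABLE_COPIED	0x01
#define	DEMAP_ALL_ADDR	(~(uintptr_t)0)

typedef struct htable_model {
	unsigned int	ht_flags;
	uintptr_t	ht_vaddr;	/* base VA covered by this table */
} htable_model_t;

/*
 * A changed top-level entry of a copied pagetable has to be republished
 * to every CPU's per-CPU tables, so signal a full demap.  Otherwise a
 * targeted INVLPG suffices; even an unlink needs it, since processors
 * cache non-leaf pagetable entries.
 */
static uintptr_t
ptp_inval_addr(const htable_model_t *higher, const htable_model_t *lower)
{
	if (higher->ht_flags & HTABLE_COPIED)
		return (DEMAP_ALL_ADDR);
	return (lower->ht_vaddr);
}

int
main(void)
{
	htable_model_t top = { HTABLE_COPIED, 0 };
	htable_model_t mid = { 0, 0x40000000 };
	htable_model_t leaf = { 0, 0x40200000 };

	printf("%lx\n", (unsigned long)ptp_inval_addr(&top, &mid));
	printf("%lx\n", (unsigned long)ptp_inval_addr(&mid, &leaf));
	return (0);
}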
*** 1293,1303 ****
  #if defined(__amd64)
  		/*
  		 * 32 bit address spaces on 64 bit kernels need to check
  		 * for overflow of the 32 bit address space
  		 */
! 		if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
  			return (NULL);
  #endif
  		base = 0;
  	} else {
  		base = vaddr & LEVEL_MASK(level + 1);
--- 1300,1311 ----
  #if defined(__amd64)
  		/*
  		 * 32 bit address spaces on 64 bit kernels need to check
  		 * for overflow of the 32 bit address space
  		 */
! 		if ((hat->hat_flags & HAT_COPIED_32) &&
! 		    vaddr >= ((uint64_t)1 << 32))
  			return (NULL);
  #endif
  		base = 0;
  	} else {
  		base = vaddr & LEVEL_MASK(level + 1);
*** 1941,1954 ****
   */
  static x86pte_t *
  x86pte_access_pagetable(htable_t *ht, uint_t index)
  {
  	/*
! 	 * VLP pagetables are contained in the hat_t
  	 */
! 	if (ht->ht_flags & HTABLE_VLP)
! 		return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
  	return (x86pte_mapin(ht->ht_pfn, index, ht));
  }
  
  /*
   * map the given pfn into the page table window.
--- 1949,1964 ----
   */
  static x86pte_t *
  x86pte_access_pagetable(htable_t *ht, uint_t index)
  {
  	/*
! 	 * HTABLE_COPIED pagetables are contained in the hat_t
  	 */
! 	if (ht->ht_flags & HTABLE_COPIED) {
! 		ASSERT3U(index, <, ht->ht_hat->hat_num_copied);
! 		return (PT_INDEX_PTR(ht->ht_hat->hat_copied_ptes, index));
! 	}
  	return (x86pte_mapin(ht->ht_pfn, index, ht));
  }
  
  /*
   * map the given pfn into the page table window.
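The hunk above has x86pte_access_pagetable() serve HTABLE_COPIED pagetables directly out of the hat_t (hat_copied_ptes) rather than mapping the table's pfn through a per-CPU window. A simplified stand-alone model of that dispatch follows; the pte type, array size, flag value, and field names are assumptions for illustration.

/*
 * Stand-alone sketch (not kernel code): the dispatch in
 * x86pte_access_pagetable().
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	HTABLE_COPIED	0x01
#define	NUM_COPIED	4	/* stands in for hat_num_copied */

typedef uint64_t pte_model_t;

typedef struct hat_model {
	unsigned int	hat_num_copied;
	pte_model_t	hat_copied_ptes[NUM_COPIED];
} hat_model_t;

typedef struct htable_model {
	unsigned int	ht_flags;
	hat_model_t	*ht_hat;
} htable_model_t;

/*
 * Copied pagetables live inside the hat_t, so just index into the hat's
 * array; anything else would be mapped in through a per-CPU pagetable
 * window (elided here, returns NULL).
 */
static pte_model_t *
access_pagetable(htable_model_t *ht, unsigned int index)
{
	if (ht->ht_flags & HTABLE_COPIED) {
		assert(index < ht->ht_hat->hat_num_copied);
		return (&ht->ht_hat->hat_copied_ptes[index]);
	}
	return (NULL);	/* real code maps ht->ht_pfn via a CPU window */
}

int
main(void)
{
	hat_model_t hat = { NUM_COPIED, { 0 } };
	htable_model_t top = { HTABLE_COPIED, &hat };

	*access_pagetable(&top, 2) = 0x1234;
	printf("pte[2] = 0x%llx\n",
	    (unsigned long long)hat.hat_copied_ptes[2]);
	return (0);
}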
*** 1977,1987 ****
--- 1987,2000 ----
  	/*
  	 * Disable preemption and grab the CPU's hci_mutex
  	 */
  	kpreempt_disable();
+ 	ASSERT(CPU->cpu_hat_info != NULL);
+ 	ASSERT(!(getcr4() & CR4_PCIDE));
+ 
  	mutex_enter(&CPU->cpu_hat_info->hci_mutex);
  	x = PWIN_TABLE(CPU->cpu_id);
  	pteptr = (x86pte_t *)PWIN_PTE_VA(x);
  #ifndef __xpv
  	if (mmu.pae_hat)
*** 2012,2022 ****
  			if (mmu.pae_hat)
  				*pteptr = newpte;
  			else
  				*(x86pte32_t *)pteptr = newpte;
  			XPV_DISALLOW_PAGETABLE_UPDATES();
! 			mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
  		}
  	}
  
  	return (PT_INDEX_PTR(PWIN_VA(x), index));
  }
--- 2025,2035 ----
  			if (mmu.pae_hat)
  				*pteptr = newpte;
  			else
  				*(x86pte32_t *)pteptr = newpte;
  			XPV_DISALLOW_PAGETABLE_UPDATES();
! 			mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
  		}
  	}
  
  	return (PT_INDEX_PTR(PWIN_VA(x), index));
  }
*** 2024,2037 ****
   * Release access to a page table.
   */
  static void
  x86pte_release_pagetable(htable_t *ht)
  {
! 	/*
! 	 * nothing to do for VLP htables
! 	 */
! 	if (ht->ht_flags & HTABLE_VLP)
  		return;
  
  	x86pte_mapout();
  }
--- 2037,2047 ----
   * Release access to a page table.
   */
  static void
  x86pte_release_pagetable(htable_t *ht)
  {
! 	if (ht->ht_flags & HTABLE_COPIED)
  		return;
  
  	x86pte_mapout();
  }
*** 2128,2138 ****
  #ifdef __xpv
  		if (!IN_XPV_PANIC())
  			xen_flush_va((caddr_t)addr);
  		else
  #endif
! 			mmu_tlbflush_entry((caddr_t)addr);
  		goto done;
  	}
  
  	/*
  	 * Detect if we have a collision of installing a large
--- 2138,2148 ----
  #ifdef __xpv
  		if (!IN_XPV_PANIC())
  			xen_flush_va((caddr_t)addr);
  		else
  #endif
! 			mmu_flush_tlb_page(addr);
  		goto done;
  	}
  
  	/*
  	 * Detect if we have a collision of installing a large
*** 2187,2197 ****
  	int cnt = 1;
  	int count;
  	maddr_t ma;
  
  	if (!IN_XPV_PANIC()) {
! 		ASSERT(!(ht->ht_flags & HTABLE_VLP));	/* no VLP yet */
  		ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
  		t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
  		t[0].val = new;
  
  #if defined(__amd64)
--- 2197,2207 ----
  	int cnt = 1;
  	int count;
  	maddr_t ma;
  
  	if (!IN_XPV_PANIC()) {
! 		ASSERT(!(ht->ht_flags & HTABLE_COPIED));
  		ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
  		t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
  		t[0].val = new;
  
  #if defined(__amd64)
*** 2344,2354 ****
  #ifndef __xpv
  /*
   * Copy page tables - this is just a little more complicated than the
   * previous routines. Note that it's also not atomic! It also is never
!  * used for VLP pagetables.
   */
  void
  x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
  {
  	caddr_t	src_va;
--- 2354,2364 ----
  #ifndef __xpv
  /*
   * Copy page tables - this is just a little more complicated than the
   * previous routines. Note that it's also not atomic! It also is never
!  * used for HTABLE_COPIED pagetables.
   */
  void
  x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
  {
  	caddr_t	src_va;
*** 2356,2367 ****
  	size_t size;
  	x86pte_t *pteptr;
  	x86pte_t pte;
  
  	ASSERT(khat_running);
! 	ASSERT(!(dest->ht_flags & HTABLE_VLP));
! 	ASSERT(!(src->ht_flags & HTABLE_VLP));
  	ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
  	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
  
  	/*
  	 * Acquire access to the CPU pagetable windows for the dest and source.
--- 2366,2377 ----
  	size_t size;
  	x86pte_t *pteptr;
  	x86pte_t pte;
  
  	ASSERT(khat_running);
! 	ASSERT(!(dest->ht_flags & HTABLE_COPIED));
! 	ASSERT(!(src->ht_flags & HTABLE_COPIED));
  	ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
  	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
  
  	/*
  	 * Acquire access to the CPU pagetable windows for the dest and source.
*** 2371,2380 ****
--- 2381,2392 ----
  		src_va = (caddr_t)
  		    PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
  	} else {
  		uint_t x = PWIN_SRC(CPU->cpu_id);
  
+ 		ASSERT(!(getcr4() & CR4_PCIDE));
+ 
  		/*
  		 * Finish defining the src pagetable mapping
  		 */
  		src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
  		pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
*** 2381,2391 ****
  		pteptr = (x86pte_t *)PWIN_PTE_VA(x);
  		if (mmu.pae_hat)
  			*pteptr = pte;
  		else
  			*(x86pte32_t *)pteptr = pte;
! 		mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
  	}
  
  	/*
  	 * now do the copy
  	 */
--- 2393,2403 ----
  		pteptr = (x86pte_t *)PWIN_PTE_VA(x);
  		if (mmu.pae_hat)
  			*pteptr = pte;
  		else
  			*(x86pte32_t *)pteptr = pte;
! 		mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
  	}
  
  	/*
  	 * now do the copy
  	 */
*** 2448,2458 ****
  	/*
  	 * Map in the page table to be zeroed.
  	 */
  	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
! 	ASSERT(!(dest->ht_flags & HTABLE_VLP));
  
  	/*
  	 * On the hypervisor we don't use x86pte_access_pagetable() since
  	 * in this case the page is not pinned yet.
  	 */
--- 2460,2470 ----
  	/*
  	 * Map in the page table to be zeroed.
  	 */
  	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
! 	ASSERT(!(dest->ht_flags & HTABLE_COPIED));
  
  	/*
  	 * On the hypervisor we don't use x86pte_access_pagetable() since
  	 * in this case the page is not pinned yet.
  	 */
*** 2502,2512 ****
  	 * Dump all page tables
  	 */
  	for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
  		for (h = 0; h < hat->hat_num_hash; ++h) {
  			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
! 				if ((ht->ht_flags & HTABLE_VLP) == 0)
  					dump_page(ht->ht_pfn);
  			}
  		}
  	}
  }
--- 2514,2524 ----
  	 * Dump all page tables
  	 */
  	for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
  		for (h = 0; h < hat->hat_num_hash; ++h) {
  			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
! 				if ((ht->ht_flags & HTABLE_COPIED) == 0)
  					dump_page(ht->ht_pfn);
  			}
  		}
  	}
  }