8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

*** 19,28 ****
--- 19,30 ----
   * CDDL HEADER END
   */
  /*
   * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
   * Use is subject to license terms.
+  *
+  * Copyright 2018 Joyent, Inc.
   */
  
  #include <sys/t_lock.h>
  #include <sys/memlist.h>
  #include <sys/cpuvar.h>
*** 59,156 ****
  #ifdef __xpv
  #include <sys/hypervisor.h>
  #endif
  
! caddr_t
! i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
! {
! 	caddr_t addr;
! 	caddr_t addr1;
! 	page_t *pp;
- 
- 	addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
- 
- 	for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
- 		pp = page_numtopp_nolock(pf);
- 		if (pp == NULL) {
- 			hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
- 			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
- 		} else {
- 			hat_memload(kas.a_hat, addr, pp,
- 			    prot | HAT_NOSYNC, HAT_LOAD_LOCK);
- 		}
- 	}
- 
- 	return (addr1);
- }
- 
- /*
-  * This routine is like page_numtopp, but accepts only free pages, which
-  * it allocates (unfrees) and returns with the exclusive lock held.
-  * It is used by machdep.c/dma_init() to find contiguous free pages.
-  *
-  * XXX this and some others should probably be in vm_machdep.c
-  */
- page_t *
- page_numtopp_alloc(pfn_t pfnum)
- {
- 	page_t *pp;
- 
- retry:
- 	pp = page_numtopp_nolock(pfnum);
- 	if (pp == NULL) {
- 		return (NULL);
- 	}
- 
- 	if (!page_trylock(pp, SE_EXCL)) {
- 		return (NULL);
- 	}
- 
- 	if (page_pptonum(pp) != pfnum) {
- 		page_unlock(pp);
- 		goto retry;
- 	}
- 
- 	if (!PP_ISFREE(pp)) {
- 		page_unlock(pp);
- 		return (NULL);
- 	}
- 	if (pp->p_szc) {
- 		page_demote_free_pages(pp);
- 		page_unlock(pp);
- 		goto retry;
- 	}
- 
- 	/* If associated with a vnode, destroy mappings */
- 
- 	if (pp->p_vnode) {
- 
- 		page_destroy_free(pp);
- 
- 		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
- 			return (NULL);
- 		}
- 
- 		if (page_pptonum(pp) != pfnum) {
- 			page_unlock(pp);
- 			goto retry;
- 		}
- 	}
- 
- 	if (!PP_ISFREE(pp)) {
- 		page_unlock(pp);
- 		return (NULL);
- 	}
- 
- 	if (!page_reclaim(pp, (kmutex_t *)NULL))
- 		return (NULL);
- 
- 	return (pp);
- }
  
  /*
   * Flag is not set early in boot. Once it is set we are no longer
   * using boot's page tables.
   */
  uint_t khat_running = 0;
--- 61,75 ----
  #ifdef __xpv
  #include <sys/hypervisor.h>
  #endif
  
! #define	ON_USER_HAT(cpu) \
! 	((cpu)->cpu_m.mcpu_current_hat != NULL && \
! 	(cpu)->cpu_m.mcpu_current_hat != kas.a_hat)
  
  /*
   * Flag is not set early in boot. Once it is set we are no longer
   * using boot's page tables.
   */
  uint_t khat_running = 0;
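For readability, the new ON_USER_HAT() test can be read as the function-style sketch below. This is illustrative only and not part of the change; the helper name on_user_hat is hypothetical. The check is true when the CPU has recorded a current HAT and that HAT is not the kernel address space's (kas.a_hat), i.e. the CPU is currently running on a user process's page tables.

#include <sys/cpuvar.h>	/* cpu_t, struct machcpu */
#include <vm/as.h>	/* kas */

static boolean_t
on_user_hat(const cpu_t *cpu)
{
	/* True when this CPU is currently on a user process HAT. */
	return (cpu->cpu_m.mcpu_current_hat != NULL &&
	    cpu->cpu_m.mcpu_current_hat != kas.a_hat);
}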
*** 434,458 ****
  	 * enough reserve for that too.
  	 */
  	table_cnt += mmu.top_level_count -
  	    ((kernelbase >> LEVEL_SHIFT(mmu.max_level)) &
  	    (mmu.top_level_count - 1));
  
- #if defined(__i386)
- 	/*
- 	 * The 32 bit PAE hat allocates tables one level below the top when
- 	 * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
- 	 * a bunch more to the reserve. Any unused will be returned later.
- 	 * Note we've already counted these mappings, just not the extra
- 	 * pagetables.
- 	 */
- 	if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
- 		table_cnt += mmu.ptes_per_table -
- 		    ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
- 		    LEVEL_SHIFT(mmu.max_level - 1));
- #endif
- 
  	/*
  	 * Add 1/4 more into table_cnt for extra slop. The unused
  	 * slop is freed back when we htable_adjust_reserve() later.
  	 */
  	table_cnt += table_cnt >> 2;
--- 353,363 ----
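The remaining slop calculation is plain shift arithmetic: table_cnt >> 2 is one quarter of the current estimate, so the reserve grows by roughly 25%, rounded down. A tiny userland sketch with made-up numbers, illustrative only:

#include <assert.h>

int
main(void)
{
	unsigned long table_cnt = 103;

	/* "table_cnt += table_cnt >> 2" adds one quarter, rounded down. */
	table_cnt += table_cnt >> 2;	/* 103 + 25 */
	assert(table_cnt == 128);
	return (0);
}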
*** 491,509 ****
  	/* BEGIN CSTYLED */
  	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
  #ifdef __xpv
  	    mmu_btop(xen_info->pt_base - ONE_GIG));
  #else
! 	    mmu_btop(getcr3()));
  #endif
  	/* END CSTYLED */
  
! #if defined(__i386) && !defined(__xpv)
! 	CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
! #endif /* __i386 */
! 
! #if defined(__xpv) && defined(__amd64)
  	/*
  	 * Try to make the kpm mappings r/w. Failures here are OK, as
  	 * it's probably just a pagetable
  	 */
  	xen_kpm_finish_init();
--- 396,410 ----
  	/* BEGIN CSTYLED */
  	htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
  #ifdef __xpv
  	    mmu_btop(xen_info->pt_base - ONE_GIG));
  #else
! 	    mmu_btop(getcr3_pa()));
  #endif
  	/* END CSTYLED */
  
! #if defined(__xpv)
  	/*
  	 * Try to make the kpm mappings r/w. Failures here are OK, as
  	 * it's probably just a pagetable
  	 */
  	xen_kpm_finish_init();
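The getcr3() to getcr3_pa() change matters once PCIDs are in play: with CR4.PCIDE set, the low 12 bits of %cr3 carry the current PCID rather than being part of the page-table address, so the top-level pagetable pfn has to be derived from the masked physical address. A minimal sketch of that masking follows; it is an assumption for illustration (getcr3_pa()'s actual definition is not shown in this hunk), with constants mirroring a 4K page size.

#include <stdint.h>

#define	MMU_PAGESHIFT	12
#define	MMU_PAGEMASK	(~((uint64_t)((1ULL << MMU_PAGESHIFT) - 1)))

/* Physical address of the top-level pagetable, with PCID/flag bits cleared. */
static uint64_t
cr3_to_pa(uint64_t cr3)
{
	return (cr3 & MMU_PAGEMASK);
}

/* Rough equivalent of mmu_btop(getcr3_pa()): the root pagetable's pfn. */
static uint64_t
cr3_to_pfn(uint64_t cr3)
{
	return (cr3_to_pa(cr3) >> MMU_PAGESHIFT);
}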
*** 515,519 ****
--- 416,596 ----
  	khat_running = 1;
  	CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
  	CPU->cpu_current_hat = kas.a_hat;
  }
+ 
+ #ifndef __xpv
+ 
+ /*
+  * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but
+  * INVPCID_ADDR isn't.
+  */
+ static void
+ invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
+ {
+ 	ulong_t	flag;
+ 	uint64_t cr4;
+ 
+ 	if (x86_use_invpcid == 1) {
+ 		ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
+ 		invpcid_insn(type, pcid, addr);
+ 		return;
+ 	}
+ 
+ 	switch (type) {
+ 	case INVPCID_ALL_GLOBAL:
+ 		flag = intr_clear();
+ 		cr4 = getcr4();
+ 		setcr4(cr4 & ~(ulong_t)CR4_PGE);
+ 		setcr4(cr4 | CR4_PGE);
+ 		intr_restore(flag);
+ 		break;
+ 
+ 	case INVPCID_ALL_NONGLOBAL:
+ 		if (!(getcr4() & CR4_PCIDE)) {
+ 			reload_cr3();
+ 		} else {
+ 			flag = intr_clear();
+ 			cr4 = getcr4();
+ 			setcr4(cr4 & ~(ulong_t)CR4_PGE);
+ 			setcr4(cr4 | CR4_PGE);
+ 			intr_restore(flag);
+ 		}
+ 		break;
+ 
+ 	case INVPCID_ADDR:
+ 		if (pcid == PCID_USER) {
+ 			flag = intr_clear();
+ 			ASSERT(addr < kernelbase);
+ 			ASSERT(ON_USER_HAT(CPU));
+ 			ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
+ 			tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
+ 			    MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
+ 			intr_restore(flag);
+ 		} else {
+ 			mmu_invlpg((caddr_t)addr);
+ 		}
+ 		break;
+ 
+ 	default:
+ 		panic("unsupported invpcid(%lu)", type);
+ 		break;
+ 	}
+ }
+ 
+ /*
+  * Flush one kernel mapping.
+  *
+  * We want to assert on kernel space here mainly for reasoning about the PCIDE
+  * case: namely, this flush should never need to flush a non-current PCID
+  * mapping. This presumes we never have reason to flush the kernel regions
+  * available to PCID_USER (the trampolines and so on). It also relies on
+  * PCID_KERNEL == PCID_NONE.
+  */
+ void
+ mmu_flush_tlb_kpage(uintptr_t va)
+ {
+ 	ASSERT(va >= kernelbase);
+ 	ASSERT(getpcid() == PCID_KERNEL);
+ 	mmu_invlpg((caddr_t)va);
+ }
+ 
+ /*
+  * Flush one mapping: local CPU version of hat_tlb_inval().
+  *
+  * If this is a userspace address in the PCIDE case, we need two invalidations,
+  * one for any potentially stale PCID_USER mapping, as well as any established
+  * while in the kernel.
+  */
+ void
+ mmu_flush_tlb_page(uintptr_t va)
+ {
+ 	ASSERT(getpcid() == PCID_KERNEL);
+ 
+ 	if (va >= kernelbase) {
+ 		mmu_flush_tlb_kpage(va);
+ 		return;
+ 	}
+ 
+ 	if (!(getcr4() & CR4_PCIDE)) {
+ 		mmu_invlpg((caddr_t)va);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * Yes, kas will need to flush below kernelspace, at least during boot.
+ 	 * But there's no PCID_USER context.
+ 	 */
+ 	if (ON_USER_HAT(CPU))
+ 		invpcid(INVPCID_ADDR, PCID_USER, va);
+ 	invpcid(INVPCID_ADDR, PCID_KERNEL, va);
+ }
+ 
+ static void
+ mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
+ {
+ 	EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
+ 	ASSERT(len > 0);
+ 	ASSERT(pgsz != 0);
+ 
+ 	if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
+ 		for (uintptr_t va = addr; va < (addr + len); va += pgsz)
+ 			mmu_flush_tlb_page(va);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * As an emulated invpcid() in the PCIDE case requires jumping
+ 	 * cr3s, we batch the invalidations. We should only need to flush the
+ 	 * user range if we're on a user-space HAT.
+ 	 */
+ 	if (addr < kernelbase && ON_USER_HAT(CPU)) {
+ 		ulong_t flag = intr_clear();
+ 		ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
+ 		tr_mmu_flush_user_range(addr, len, pgsz,
+ 		    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
+ 		intr_restore(flag);
+ 	}
+ 
+ 	for (uintptr_t va = addr; va < (addr + len); va += pgsz)
+ 		mmu_invlpg((caddr_t)va);
+ }
+ 
+ /*
+  * MMU TLB (and PT cache) flushing on this CPU.
+  *
+  * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
+  * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
+  * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
+  * mappings as appropriate. If using invpcid, PT_GLOBAL mappings are not
+  * invalidated.
+  */
+ void
+ mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
+ {
+ 	ASSERT(getpcid() == PCID_KERNEL);
+ 
+ 	switch (type) {
+ 	case FLUSH_TLB_ALL:
+ 		ASSERT(range == NULL);
+ 		invpcid(INVPCID_ALL_GLOBAL, 0, 0);
+ 		break;
+ 
+ 	case FLUSH_TLB_NONGLOBAL:
+ 		ASSERT(range == NULL);
+ 		invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
+ 		break;
+ 
+ 	case FLUSH_TLB_RANGE: {
+ 		mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
+ 		    LEVEL_SIZE(range->tr_level));
+ 		break;
+ 	}
+ 
+ 	default:
+ 		panic("invalid call mmu_flush_tlb(%d)", type);
+ 		break;
+ 	}
+ }
+ 
+ #endif /* ! __xpv */
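A hypothetical caller-side sketch of the new local-CPU flush interface, illustrative only and not part of the change: the tlb_range_t member names tr_va, tr_cnt and tr_level are inferred from the uses of range->tr_va, range->tr_level and TLB_RANGE_LEN() above and may not match the header exactly.

static void
example_local_flushes(uintptr_t user_va, uintptr_t kern_va)
{
	tlb_range_t range;

	/* One 4K user mapping; also hits PCID_USER when on a user HAT. */
	mmu_flush_tlb_page(user_va);

	/* Sixteen level-0 (4K) pages starting at kern_va. */
	range.tr_va = kern_va;
	range.tr_cnt = 16;
	range.tr_level = 0;
	mmu_flush_tlb(FLUSH_TLB_RANGE, &range);

	/* Everything non-global, e.g. after a wholesale PCID changeover. */
	mmu_flush_tlb(FLUSH_TLB_NONGLOBAL, NULL);
}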