8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/t_lock.h>
  27 #include <sys/memlist.h>
  28 #include <sys/cpuvar.h>
  29 #include <sys/vmem.h>
  30 #include <sys/mman.h>
  31 #include <sys/vm.h>
  32 #include <sys/kmem.h>
  33 #include <sys/cmn_err.h>
  34 #include <sys/debug.h>
  35 #include <sys/vm_machparam.h>
  36 #include <sys/tss.h>
  37 #include <sys/vnode.h>
  38 #include <vm/hat.h>
  39 #include <vm/anon.h>
  40 #include <vm/as.h>
  41 #include <vm/page.h>
  42 #include <vm/seg.h>
  43 #include <vm/seg_kmem.h>
  44 #include <vm/seg_map.h>
  45 #include <vm/hat_i86.h>
  46 #include <sys/promif.h>
  47 #include <sys/x86_archext.h>
  48 #include <sys/systm.h>
  49 #include <sys/archsystm.h>
  50 #include <sys/sunddi.h>
  51 #include <sys/ddidmareq.h>
  52 #include <sys/controlregs.h>
  53 #include <sys/reboot.h>
  54 #include <sys/kdi.h>
  55 #include <sys/bootconf.h>
  56 #include <sys/bootsvcs.h>
  57 #include <sys/bootinfo.h>
  58 #include <vm/kboot_mmu.h>
  59 
  60 #ifdef __xpv
  61 #include <sys/hypervisor.h>
  62 #endif
  63 
  64 caddr_t
  65 i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
  66 {
  67         caddr_t addr;
  68         caddr_t addr1;
  69         page_t *pp;
  70 
  71         addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
  72 
  73         for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
  74                 pp = page_numtopp_nolock(pf);
  75                 if (pp == NULL) {
  76                         hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
  77                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
  78                 } else {
  79                         hat_memload(kas.a_hat, addr, pp,
  80                             prot | HAT_NOSYNC, HAT_LOAD_LOCK);
  81                 }
  82         }
  83 
  84         return (addr1);
  85 }
  86 
  87 /*
  88  * This routine is like page_numtopp, but accepts only free pages, which
  89  * it allocates (unfrees) and returns with the exclusive lock held.
  90  * It is used by machdep.c/dma_init() to find contiguous free pages.
  91  *
  92  * XXX this and some others should probably be in vm_machdep.c
  93  */
  94 page_t *
  95 page_numtopp_alloc(pfn_t pfnum)
  96 {
  97         page_t *pp;
  98 
  99 retry:
 100         pp = page_numtopp_nolock(pfnum);
 101         if (pp == NULL) {
 102                 return (NULL);
 103         }
 104 
 105         if (!page_trylock(pp, SE_EXCL)) {
 106                 return (NULL);
 107         }
 108 
 109         if (page_pptonum(pp) != pfnum) {
 110                 page_unlock(pp);
 111                 goto retry;
 112         }
 113 
 114         if (!PP_ISFREE(pp)) {
 115                 page_unlock(pp);
 116                 return (NULL);
 117         }
 118         if (pp->p_szc) {
 119                 page_demote_free_pages(pp);
 120                 page_unlock(pp);
 121                 goto retry;
 122         }
 123 
 124         /* If associated with a vnode, destroy mappings */
 125 
 126         if (pp->p_vnode) {
 127 
 128                 page_destroy_free(pp);
 129 
 130                 if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 131                         return (NULL);
 132                 }
 133 
 134                 if (page_pptonum(pp) != pfnum) {
 135                         page_unlock(pp);
 136                         goto retry;
 137                 }
 138         }
 139 
 140         if (!PP_ISFREE(pp)) {
 141                 page_unlock(pp);
 142                 return (NULL);
 143         }
 144 
 145         if (!page_reclaim(pp, (kmutex_t *)NULL))
 146                 return (NULL);
 147 
 148         return (pp);
 149 }
 150 
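As an aside for review context: the block comment above notes that machdep.c/dma_init() uses this routine to find contiguous free pages. A minimal sketch of that usage pattern follows; the function and variable names (probe_contig_free, base, npages, pplist) are illustrative assumptions, not part of this change, and the unwinding a real caller needs on failure is elided.

	/*
	 * Illustrative only: probe whether "npages" page frames starting
	 * at "base" are all free, claiming each one exclusively locked as
	 * we go.  On failure the caller must undo pages 0..i-1.
	 */
	static int
	probe_contig_free(pfn_t base, pgcnt_t npages, page_t **pplist)
	{
		pgcnt_t i;

		for (i = 0; i < npages; i++) {
			pplist[i] = page_numtopp_alloc(base + i);
			if (pplist[i] == NULL)
				return (0);
		}
		return (1);
	}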
 151 /*
 152  * Flag is not set early in boot. Once it is set we are no longer
 153  * using boot's page tables.
 154  */
 155 uint_t khat_running = 0;
 156 
 157 /*
 158  * This procedure is callable only while the boot loader is in charge of the
 159  * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
 160  * kboot_mmu.c since it's used from common code.
 161  */
 162 pfn_t
 163 va_to_pfn(void *vaddr)
 164 {
 165         uintptr_t       des_va = ALIGN2PAGE(vaddr);
 166         uintptr_t       va = des_va;
 167         size_t          len;
 168         uint_t          prot;
 169         pfn_t           pfn;
 170 
 171         if (khat_running)


 419 
 420                 for (l = start_level; l < mmu.max_level; ++l) {
 421                         if (va >> LEVEL_SHIFT(l + 1) ==
 422                             last_va >> LEVEL_SHIFT(l + 1))
 423                                 break;
 424                         ++table_cnt;
 425                 }
 426                 last_va = va;
 427                 l = (start_level == 0) ? 1 : start_level;
 428                 va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
 429         }
 430 
 431         /*
 432          * Besides the boot loader mappings, we're going to fill in
 433          * the entire top level page table for the kernel. Make sure there's
 434          * enough reserve for that too.
 435          */
 436         table_cnt += mmu.top_level_count - ((kernelbase >>
 437             LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
 438 
 439 #if defined(__i386)
 440         /*
 441          * The 32 bit PAE hat allocates tables one level below the top when
 442          * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
 443          * a bunch more to the reserve. Any unused will be returned later.
 444          * Note we've already counted these mappings, just not the extra
 445          * pagetables.
 446          */
 447         if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
 448                 table_cnt += mmu.ptes_per_table -
 449                     ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
 450                     LEVEL_SHIFT(mmu.max_level - 1));
 451 #endif
 452 
 453         /*
 454          * Add 1/4 more into table_cnt for extra slop.  The unused
 455          * slop is freed back when we htable_adjust_reserve() later.
 456          */
 457         table_cnt += table_cnt >> 2;
 458 
 459         /*
 460          * We only need mapping entries (hments) for shared pages.
 461          * This should be far, far fewer than the total possible,
 462          * We'll allocate enough for 1/16 of all possible PTEs.
 463          */
 464         mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;
 465 
 466         /*
 467          * Now create the initial htable/hment reserves
 468          */
 469         htable_initial_reserve(table_cnt);
 470         hment_reserve(mapping_cnt);
 471         x86pte_cpu_init(CPU);
 472 }
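To make the sizing above concrete with purely illustrative numbers: with 512 PTEs per table, a table_cnt of 40 after counting the boot mappings and the top-level fill becomes 50 once the 1/4 slop is added (40 + 40/4), and mapping_cnt = (50 * 512) >> 4 = 1600 hments go into the initial reserve; any unused slop is handed back later by htable_adjust_reserve().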
 473 


 476  * This routine handles the work of creating the kernel's initial mappings
 477  * by deciphering the mappings in the page tables created by the boot program.
 478  *
 479  * We maintain large page mappings, but only to a level 1 pagesize.
 480  * The boot loader can only add new mappings once this function starts.
 481  * In particular it can not change the pagesize used for any existing
 482  * mappings or this code breaks!
 483  */
 484 
 485 void
 486 hat_kern_setup(void)
 487 {
 488         /*
 489          * Attach htables to the existing pagetables
 490          */
 491         /* BEGIN CSTYLED */
 492         htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
 493 #ifdef __xpv
 494             mmu_btop(xen_info->pt_base - ONE_GIG));
 495 #else
 496             mmu_btop(getcr3()));
 497 #endif
 498         /* END CSTYLED */
 499 
 500 #if defined(__i386) && !defined(__xpv)
 501         CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
 502 #endif /* __i386 */
 503 
 504 #if defined(__xpv) && defined(__amd64)
 505         /*
 506          * Try to make the kpm mappings r/w. Failures here are OK, as
 507          * it's probably just a pagetable
 508          */
 509         xen_kpm_finish_init();
 510 #endif
 511 
 512         /*
 513          * The kernel HAT is now officially open for business.
 514          */
 515         khat_running = 1;
 516 
 517         CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
 518         CPU->cpu_current_hat = kas.a_hat;
 519 }


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2018 Joyent, Inc.
  26  */
  27 
  28 #include <sys/t_lock.h>
  29 #include <sys/memlist.h>
  30 #include <sys/cpuvar.h>
  31 #include <sys/vmem.h>
  32 #include <sys/mman.h>
  33 #include <sys/vm.h>
  34 #include <sys/kmem.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/debug.h>
  37 #include <sys/vm_machparam.h>
  38 #include <sys/tss.h>
  39 #include <sys/vnode.h>
  40 #include <vm/hat.h>
  41 #include <vm/anon.h>
  42 #include <vm/as.h>
  43 #include <vm/page.h>
  44 #include <vm/seg.h>
  45 #include <vm/seg_kmem.h>
  46 #include <vm/seg_map.h>
  47 #include <vm/hat_i86.h>
  48 #include <sys/promif.h>
  49 #include <sys/x86_archext.h>
  50 #include <sys/systm.h>
  51 #include <sys/archsystm.h>
  52 #include <sys/sunddi.h>
  53 #include <sys/ddidmareq.h>
  54 #include <sys/controlregs.h>
  55 #include <sys/reboot.h>
  56 #include <sys/kdi.h>
  57 #include <sys/bootconf.h>
  58 #include <sys/bootsvcs.h>
  59 #include <sys/bootinfo.h>
  60 #include <vm/kboot_mmu.h>
  61 
  62 #ifdef __xpv
  63 #include <sys/hypervisor.h>
  64 #endif
  65 
  66 #define ON_USER_HAT(cpu) \
  67         ((cpu)->cpu_m.mcpu_current_hat != NULL && \
  68         (cpu)->cpu_m.mcpu_current_hat != kas.a_hat)
  69 
  70 /*
  71  * Flag is not set early in boot. Once it is set we are no longer
  72  * using boot's page tables.
  73  */
  74 uint_t khat_running = 0;
  75 
  76 /*
  77  * This procedure is callable only while the boot loader is in charge of the
  78  * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
  79  * kboot_mmu.c since it's used from common code.
  80  */
  81 pfn_t
  82 va_to_pfn(void *vaddr)
  83 {
  84         uintptr_t       des_va = ALIGN2PAGE(vaddr);
  85         uintptr_t       va = des_va;
  86         size_t          len;
  87         uint_t          prot;
  88         pfn_t           pfn;
  89 
  90         if (khat_running)


 338 
 339                 for (l = start_level; l < mmu.max_level; ++l) {
 340                         if (va >> LEVEL_SHIFT(l + 1) ==
 341                             last_va >> LEVEL_SHIFT(l + 1))
 342                                 break;
 343                         ++table_cnt;
 344                 }
 345                 last_va = va;
 346                 l = (start_level == 0) ? 1 : start_level;
 347                 va = (va & LEVEL_MASK(l)) + LEVEL_SIZE(l);
 348         }
 349 
 350         /*
 351          * Besides the boot loader mappings, we're going to fill in
 352          * the entire top level page table for the kernel. Make sure there's
 353          * enough reserve for that too.
 354          */
 355         table_cnt += mmu.top_level_count - ((kernelbase >>
 356             LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
 357 
 358         /*
 359          * Add 1/4 more into table_cnt for extra slop.  The unused
 360          * slop is freed back when we htable_adjust_reserve() later.
 361          */
 362         table_cnt += table_cnt >> 2;
 363 
 364         /*
 365          * We only need mapping entries (hments) for shared pages.
  366          * This should be far, far fewer than the total possible.
  367          * We'll allocate enough for 1/16 of all possible PTEs.
 368          */
 369         mapping_cnt = (table_cnt * mmu.ptes_per_table) >> 4;
 370 
 371         /*
 372          * Now create the initial htable/hment reserves
 373          */
 374         htable_initial_reserve(table_cnt);
 375         hment_reserve(mapping_cnt);
 376         x86pte_cpu_init(CPU);
 377 }
 378 


 381  * This routine handles the work of creating the kernel's initial mappings
 382  * by deciphering the mappings in the page tables created by the boot program.
 383  *
 384  * We maintain large page mappings, but only to a level 1 pagesize.
 385  * The boot loader can only add new mappings once this function starts.
 386  * In particular it can not change the pagesize used for any existing
 387  * mappings or this code breaks!
 388  */
 389 
 390 void
 391 hat_kern_setup(void)
 392 {
 393         /*
 394          * Attach htables to the existing pagetables
 395          */
 396         /* BEGIN CSTYLED */
 397         htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
 398 #ifdef __xpv
 399             mmu_btop(xen_info->pt_base - ONE_GIG));
 400 #else
 401             mmu_btop(getcr3_pa()));
 402 #endif
 403         /* END CSTYLED */
 404 
 405 #if defined(__xpv)
 406         /*
 407          * Try to make the kpm mappings r/w. Failures here are OK, as
 408          * it's probably just a pagetable
 409          */
 410         xen_kpm_finish_init();
 411 #endif
 412 
 413         /*
 414          * The kernel HAT is now officially open for business.
 415          */
 416         khat_running = 1;
 417 
 418         CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
 419         CPU->cpu_current_hat = kas.a_hat;
 420 }
 421 
 422 #ifndef __xpv
 423 
 424 /*
 425  * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but
 426  * INVPCID_ADDR isn't.
 427  */
 428 static void
 429 invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
 430 {
 431         ulong_t flag;
 432         uint64_t cr4;
 433 
 434         if (x86_use_invpcid == 1) {
 435                 ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
 436                 invpcid_insn(type, pcid, addr);
 437                 return;
 438         }
 439 
 440         switch (type) {
 441         case INVPCID_ALL_GLOBAL:
 442                 flag = intr_clear();
 443                 cr4 = getcr4();
 444                 setcr4(cr4 & ~(ulong_t)CR4_PGE);
 445                 setcr4(cr4 | CR4_PGE);
 446                 intr_restore(flag);
 447                 break;
 448 
 449         case INVPCID_ALL_NONGLOBAL:
 450                 if (!(getcr4() & CR4_PCIDE)) {
 451                         reload_cr3();
 452                 } else {
 453                         flag = intr_clear();
 454                         cr4 = getcr4();
 455                         setcr4(cr4 & ~(ulong_t)CR4_PGE);
 456                         setcr4(cr4 | CR4_PGE);
 457                         intr_restore(flag);
 458                 }
 459                 break;
 460 
 461         case INVPCID_ADDR:
 462                 if (pcid == PCID_USER) {
 463                         flag = intr_clear();
 464                         ASSERT(addr < kernelbase);
 465                         ASSERT(ON_USER_HAT(CPU));
 466                         ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
 467                         tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
 468                             MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
 469                         intr_restore(flag);
 470                 } else {
 471                         mmu_invlpg((caddr_t)addr);
 472                 }
 473                 break;
 474 
 475         default:
 476                 panic("unsupported invpcid(%lu)", type);
 477                 break;
 478         }
 479 }
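For reviewers unfamiliar with the instruction itself: INVPCID takes the invalidation type in a register and a 16-byte memory descriptor whose low 12 bits carry the PCID and whose second quadword carries the linear address (used only for the single-address type). A rough, illustrative sketch of what the invpcid_insn() stub boils down to is below; this is not the actual illumos implementation, which lives in assembly.

	/* Illustrative sketch only; not the illumos invpcid_insn() stub. */
	static inline void
	invpcid_sketch(uint64_t type, uint64_t pcid, uintptr_t addr)
	{
		struct {
			uint64_t ipd_pcid;	/* bits 0-11: PCID; rest must be zero */
			uint64_t ipd_addr;	/* linear address, for INVPCID_ADDR */
		} desc = { pcid & 0xfff, addr };

		__asm__ __volatile__("invpcid %0, %1"
		    : : "m" (desc), "r" (type) : "memory");
	}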
 480 
 481 /*
 482  * Flush one kernel mapping.
 483  *
 484  * We want to assert on kernel space here mainly for reasoning about the PCIDE
 485  * case: namely, this flush should never need to flush a non-current PCID
 486  * mapping.  This presumes we never have reason to flush the kernel regions
 487  * available to PCID_USER (the trampolines and so on).  It also relies on
 488  * PCID_KERNEL == PCID_NONE.
 489  */
 490 void
 491 mmu_flush_tlb_kpage(uintptr_t va)
 492 {
 493         ASSERT(va >= kernelbase);
 494         ASSERT(getpcid() == PCID_KERNEL);
 495         mmu_invlpg((caddr_t)va);
 496 }
 497 
 498 /*
 499  * Flush one mapping: local CPU version of hat_tlb_inval().
 500  *
 501  * If this is a userspace address in the PCIDE case, we need two invalidations,
 502  * one for any potentially stale PCID_USER mapping, as well as any established
 503  * while in the kernel.
 504  */
 505 void
 506 mmu_flush_tlb_page(uintptr_t va)
 507 {
 508         ASSERT(getpcid() == PCID_KERNEL);
 509 
 510         if (va >= kernelbase) {
 511                 mmu_flush_tlb_kpage(va);
 512                 return;
 513         }
 514 
 515         if (!(getcr4() & CR4_PCIDE)) {
 516                 mmu_invlpg((caddr_t)va);
 517                 return;
 518         }
 519 
 520         /*
 521          * Yes, kas will need to flush below kernelspace, at least during boot.
 522          * But there's no PCID_USER context.
 523          */
 524         if (ON_USER_HAT(CPU))
 525                 invpcid(INVPCID_ADDR, PCID_USER, va);
 526         invpcid(INVPCID_ADDR, PCID_KERNEL, va);
 527 }
 528 
 529 static void
 530 mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
 531 {
 532         EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
 533         ASSERT(len > 0);
 534         ASSERT(pgsz != 0);
 535 
 536         if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
 537                 for (uintptr_t va = addr; va < (addr + len); va += pgsz)
 538                         mmu_flush_tlb_page(va);
 539                 return;
 540         }
 541 
 542         /*
 543          * As an emulated invpcid() in the PCIDE case requires jumping
 544          * cr3s, we batch the invalidations.  We should only need to flush the
 545          * user range if we're on a user-space HAT.
 546          */
 547         if (addr < kernelbase && ON_USER_HAT(CPU)) {
 548                 ulong_t flag = intr_clear();
 549                 ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
 550                 tr_mmu_flush_user_range(addr, len, pgsz,
 551                     CPU->cpu_m.mcpu_kpti.kf_user_cr3);
 552                 intr_restore(flag);
 553         }
 554 
 555         for (uintptr_t va = addr; va < (addr + len); va += pgsz)
 556                 mmu_invlpg((caddr_t)va);
 557 }
 558 
 559 /*
 560  * MMU TLB (and PT cache) flushing on this CPU.
 561  *
 562  * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
 563  * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
 564  * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
 565  * mappings as appropriate.  If using invpcid, PT_GLOBAL mappings are not
 566  * invalidated.
 567  */
 568 void
 569 mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
 570 {
 571         ASSERT(getpcid() == PCID_KERNEL);
 572 
 573         switch (type) {
 574         case FLUSH_TLB_ALL:
 575                 ASSERT(range == NULL);
 576                 invpcid(INVPCID_ALL_GLOBAL, 0, 0);
 577                 break;
 578 
 579         case FLUSH_TLB_NONGLOBAL:
 580                 ASSERT(range == NULL);
 581                 invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
 582                 break;
 583 
 584         case FLUSH_TLB_RANGE: {
 585                 mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
 586                     LEVEL_SIZE(range->tr_level));
 587                 break;
 588         }
 589 
 590         default:
 591                 panic("invalid call mmu_flush_tlb(%d)", type);
 592                 break;
 593         }
 594 }
 595 
 596 #endif /* ! __xpv */
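
Finally, a hedged sketch of how a caller might use the new local-CPU interface to shoot down a run of 4K mappings. The tr_cnt field name is an assumption inferred from TLB_RANGE_LEN(); cross-CPU shootdowns would still go through hat_tlb_inval() rather than calling this directly.

	/*
	 * Illustrative only: flush "cnt" level-0 (4K) mappings starting at
	 * "va" on the current CPU.
	 */
	static void
	flush_local_4k_range(uintptr_t va, ulong_t cnt)
	{
		tlb_range_t range;

		range.tr_va = va;
		range.tr_cnt = cnt;	/* assumed field; see TLB_RANGE_LEN() */
		range.tr_level = 0;

		mmu_flush_tlb(FLUSH_TLB_RANGE, &range);
	}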