8956 Implement KPTI
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

--- old/usr/src/uts/i86pc/vm/i86_mmu.c
+++ new/usr/src/uts/i86pc/vm/i86_mmu.c
[ 13 lines elided ]
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
       24 + *
       25 + * Copyright 2018 Joyent, Inc.
  24   26   */
  25   27  
  26   28  #include <sys/t_lock.h>
  27   29  #include <sys/memlist.h>
  28   30  #include <sys/cpuvar.h>
  29   31  #include <sys/vmem.h>
  30   32  #include <sys/mman.h>
  31   33  #include <sys/vm.h>
  32   34  #include <sys/kmem.h>
  33   35  #include <sys/cmn_err.h>
[ 20 lines elided ]
  54   56  #include <sys/kdi.h>
  55   57  #include <sys/bootconf.h>
  56   58  #include <sys/bootsvcs.h>
  57   59  #include <sys/bootinfo.h>
  58   60  #include <vm/kboot_mmu.h>
  59   61  
  60   62  #ifdef __xpv
  61   63  #include <sys/hypervisor.h>
  62   64  #endif
  63   65  
  64      -caddr_t
  65      -i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
  66      -{
  67      -        caddr_t addr;
  68      -        caddr_t addr1;
  69      -        page_t *pp;
       66 +#define ON_USER_HAT(cpu) \
       67 +        ((cpu)->cpu_m.mcpu_current_hat != NULL && \
       68 +        (cpu)->cpu_m.mcpu_current_hat != kas.a_hat)
  70   69  
  71      -        addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
  72      -
  73      -        for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
  74      -                pp = page_numtopp_nolock(pf);
  75      -                if (pp == NULL) {
  76      -                        hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
  77      -                            prot | HAT_NOSYNC, HAT_LOAD_LOCK);
  78      -                } else {
  79      -                        hat_memload(kas.a_hat, addr, pp,
  80      -                            prot | HAT_NOSYNC, HAT_LOAD_LOCK);
  81      -                }
  82      -        }
  83      -
  84      -        return (addr1);
  85      -}
  86      -
  87   70  /*
  88      - * This routine is like page_numtopp, but accepts only free pages, which
  89      - * it allocates (unfrees) and returns with the exclusive lock held.
  90      - * It is used by machdep.c/dma_init() to find contiguous free pages.
  91      - *
  92      - * XXX this and some others should probably be in vm_machdep.c
  93      - */
  94      -page_t *
  95      -page_numtopp_alloc(pfn_t pfnum)
  96      -{
  97      -        page_t *pp;
  98      -
  99      -retry:
 100      -        pp = page_numtopp_nolock(pfnum);
 101      -        if (pp == NULL) {
 102      -                return (NULL);
 103      -        }
 104      -
 105      -        if (!page_trylock(pp, SE_EXCL)) {
 106      -                return (NULL);
 107      -        }
 108      -
 109      -        if (page_pptonum(pp) != pfnum) {
 110      -                page_unlock(pp);
 111      -                goto retry;
 112      -        }
 113      -
 114      -        if (!PP_ISFREE(pp)) {
 115      -                page_unlock(pp);
 116      -                return (NULL);
 117      -        }
 118      -        if (pp->p_szc) {
 119      -                page_demote_free_pages(pp);
 120      -                page_unlock(pp);
 121      -                goto retry;
 122      -        }
 123      -
 124      -        /* If associated with a vnode, destroy mappings */
 125      -
 126      -        if (pp->p_vnode) {
 127      -
 128      -                page_destroy_free(pp);
 129      -
 130      -                if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 131      -                        return (NULL);
 132      -                }
 133      -
 134      -                if (page_pptonum(pp) != pfnum) {
 135      -                        page_unlock(pp);
 136      -                        goto retry;
 137      -                }
 138      -        }
 139      -
 140      -        if (!PP_ISFREE(pp)) {
 141      -                page_unlock(pp);
 142      -                return (NULL);
 143      -        }
 144      -
 145      -        if (!page_reclaim(pp, (kmutex_t *)NULL))
 146      -                return (NULL);
 147      -
 148      -        return (pp);
 149      -}
 150      -
 151      -/*
 152   71   * Flag is not set early in boot. Once it is set we are no longer
 153   72   * using boot's page tables.
 154   73   */
 155   74  uint_t khat_running = 0;
 156   75  
 157   76  /*
 158   77   * This procedure is callable only while the boot loader is in charge of the
 159   78   * MMU. It assumes that PA == VA for page table pointers.  It doesn't live in
 160   79   * kboot_mmu.c since it's used from common code.
 161   80   */
[ 267 lines elided ]
 429  348          }
 430  349  
 431  350          /*
 432  351           * Besides the boot loader mappings, we're going to fill in
 433  352           * the entire top level page table for the kernel. Make sure there's
 434  353           * enough reserve for that too.
 435  354           */
 436  355          table_cnt += mmu.top_level_count - ((kernelbase >>
 437  356              LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
 438  357  
 439      -#if defined(__i386)
 440  358          /*
 441      -         * The 32 bit PAE hat allocates tables one level below the top when
 442      -         * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
 443      -         * a bunch more to the reserve. Any unused will be returned later.
 444      -         * Note we've already counted these mappings, just not the extra
 445      -         * pagetables.
 446      -         */
 447      -        if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
 448      -                table_cnt += mmu.ptes_per_table -
 449      -                    ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
 450      -                    LEVEL_SHIFT(mmu.max_level - 1));
 451      -#endif
 452      -
 453      -        /*
 454  359           * Add 1/4 more into table_cnt for extra slop.  The unused
 455  360           * slop is freed back when we htable_adjust_reserve() later.
 456  361           */
 457  362          table_cnt += table_cnt >> 2;
 458  363  
 459  364          /*
 460  365           * We only need mapping entries (hments) for shared pages.
 461  366           * This should be far, far fewer than the total possible,
 462  367           * We'll allocate enough for 1/16 of all possible PTEs.
 463  368           */
[ 22 lines elided ]
 486  391  hat_kern_setup(void)
 487  392  {
 488  393          /*
 489  394           * Attach htables to the existing pagetables
 490  395           */
 491  396          /* BEGIN CSTYLED */
 492  397          htable_attach(kas.a_hat, 0, mmu.max_level, NULL,
 493  398  #ifdef __xpv
 494  399              mmu_btop(xen_info->pt_base - ONE_GIG));
 495  400  #else
 496      -            mmu_btop(getcr3()));
      401 +            mmu_btop(getcr3_pa()));
 497  402  #endif
 498  403          /* END CSTYLED */
 499  404  
 500      -#if defined(__i386) && !defined(__xpv)
 501      -        CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
 502      -#endif /* __i386 */
 503      -
 504      -#if defined(__xpv) && defined(__amd64)
      405 +#if defined(__xpv)
 505  406          /*
 506  407           * Try to make the kpm mappings r/w. Failures here are OK, as
 507  408           * it's probably just a pagetable
 508  409           */
 509  410          xen_kpm_finish_init();
 510  411  #endif
 511  412  
 512  413          /*
 513  414           * The kernel HAT is now officially open for business.
 514  415           */
 515  416          khat_running = 1;
 516  417  
 517  418          CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
 518  419          CPU->cpu_current_hat = kas.a_hat;
 519  420  }
      421 +
      422 +#ifndef __xpv
      423 +
      424 +/*
      425 + * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but
      426 + * INVPCID_ADDR isn't.
      427 + */
      428 +static void
      429 +invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
      430 +{
      431 +        ulong_t flag;
      432 +        uint64_t cr4;
      433 +
      434 +        if (x86_use_invpcid == 1) {
      435 +                ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
      436 +                invpcid_insn(type, pcid, addr);
      437 +                return;
      438 +        }
      439 +
      440 +        switch (type) {
      441 +        case INVPCID_ALL_GLOBAL:
      442 +                flag = intr_clear();
      443 +                cr4 = getcr4();
      444 +                setcr4(cr4 & ~(ulong_t)CR4_PGE);
      445 +                setcr4(cr4 | CR4_PGE);
      446 +                intr_restore(flag);
      447 +                break;
      448 +
      449 +        case INVPCID_ALL_NONGLOBAL:
      450 +                if (!(getcr4() & CR4_PCIDE)) {
      451 +                        reload_cr3();
      452 +                } else {
      453 +                        flag = intr_clear();
      454 +                        cr4 = getcr4();
      455 +                        setcr4(cr4 & ~(ulong_t)CR4_PGE);
      456 +                        setcr4(cr4 | CR4_PGE);
      457 +                        intr_restore(flag);
      458 +                }
      459 +                break;
      460 +
      461 +        case INVPCID_ADDR:
      462 +                if (pcid == PCID_USER) {
      463 +                        flag = intr_clear();
      464 +                        ASSERT(addr < kernelbase);
      465 +                        ASSERT(ON_USER_HAT(CPU));
      466 +                        ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
      467 +                        tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
      468 +                            MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
      469 +                        intr_restore(flag);
      470 +                } else {
      471 +                        mmu_invlpg((caddr_t)addr);
      472 +                }
      473 +                break;
      474 +
      475 +        default:
      476 +                panic("unsupported invpcid(%lu)", type);
      477 +                break;
      478 +        }
      479 +}
      480 +
      481 +/*
      482 + * Flush one kernel mapping.
      483 + *
      484 + * We want to assert on kernel space here mainly for reasoning about the PCIDE
      485 + * case: namely, this flush should never need to flush a non-current PCID
      486 + * mapping.  This presumes we never have reason to flush the kernel regions
      487 + * available to PCID_USER (the trampolines and so on).  It also relies on
      488 + * PCID_KERNEL == PCID_NONE.
      489 + */
      490 +void
      491 +mmu_flush_tlb_kpage(uintptr_t va)
      492 +{
      493 +        ASSERT(va >= kernelbase);
      494 +        ASSERT(getpcid() == PCID_KERNEL);
      495 +        mmu_invlpg((caddr_t)va);
      496 +}
      497 +
      498 +/*
      499 + * Flush one mapping: local CPU version of hat_tlb_inval().
      500 + *
      501 + * If this is a userspace address in the PCIDE case, we need two invalidations,
      502 + * one for any potentially stale PCID_USER mapping, as well as any established
      503 + * while in the kernel.
      504 + */
      505 +void
      506 +mmu_flush_tlb_page(uintptr_t va)
      507 +{
      508 +        ASSERT(getpcid() == PCID_KERNEL);
      509 +
      510 +        if (va >= kernelbase) {
      511 +                mmu_flush_tlb_kpage(va);
      512 +                return;
      513 +        }
      514 +
      515 +        if (!(getcr4() & CR4_PCIDE)) {
      516 +                mmu_invlpg((caddr_t)va);
      517 +                return;
      518 +        }
      519 +
      520 +        /*
      521 +         * Yes, kas will need to flush below kernelspace, at least during boot.
      522 +         * But there's no PCID_USER context.
      523 +         */
      524 +        if (ON_USER_HAT(CPU))
      525 +                invpcid(INVPCID_ADDR, PCID_USER, va);
      526 +        invpcid(INVPCID_ADDR, PCID_KERNEL, va);
      527 +}
      528 +
      529 +static void
      530 +mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
      531 +{
      532 +        EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
      533 +        ASSERT(len > 0);
      534 +        ASSERT(pgsz != 0);
      535 +
      536 +        if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
      537 +                for (uintptr_t va = addr; va < (addr + len); va += pgsz)
      538 +                        mmu_flush_tlb_page(va);
      539 +                return;
      540 +        }
      541 +
      542 +        /*
      543 +         * As an emulated invpcid() in the PCIDE case requires jumping
      544 +         * cr3s, we batch the invalidations.  We should only need to flush the
      545 +         * user range if we're on a user-space HAT.
      546 +         */
      547 +        if (addr < kernelbase && ON_USER_HAT(CPU)) {
      548 +                ulong_t flag = intr_clear();
      549 +                ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
      550 +                tr_mmu_flush_user_range(addr, len, pgsz,
      551 +                    CPU->cpu_m.mcpu_kpti.kf_user_cr3);
      552 +                intr_restore(flag);
      553 +        }
      554 +
      555 +        for (uintptr_t va = addr; va < (addr + len); va += pgsz)
      556 +                mmu_invlpg((caddr_t)va);
      557 +}
      558 +
      559 +/*
      560 + * MMU TLB (and PT cache) flushing on this CPU.
      561 + *
      562 + * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
      563 + * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
      564 + * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
      565 + * mappings as appropriate.  If using invpcid, PT_GLOBAL mappings are not
      566 + * invalidated.
      567 + */
      568 +void
      569 +mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
      570 +{
      571 +        ASSERT(getpcid() == PCID_KERNEL);
      572 +
      573 +        switch (type) {
      574 +        case FLUSH_TLB_ALL:
      575 +                ASSERT(range == NULL);
      576 +                invpcid(INVPCID_ALL_GLOBAL, 0, 0);
      577 +                break;
      578 +
      579 +        case FLUSH_TLB_NONGLOBAL:
      580 +                ASSERT(range == NULL);
      581 +                invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
      582 +                break;
      583 +
      584 +        case FLUSH_TLB_RANGE: {
      585 +                mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
      586 +                    LEVEL_SIZE(range->tr_level));
      587 +                break;
      588 +        }
      589 +
      590 +        default:
      591 +                panic("invalid call mmu_flush_tlb(%d)", type);
      592 +                break;
      593 +        }
      594 +}
      595 +
      596 +#endif /* ! __xpv */
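
For readers of this review, a minimal caller sketch of the new local-CPU flush interface follows; it is not part of the patch above. Only tr_va and tr_level of tlb_range_t are visible in this file, so the tr_cnt field name and the surrounding helper function are illustrative assumptions, and the declarations are taken to come from the HAT headers touched elsewhere in this change.

    /*
     * Hypothetical example (not from the patch): flush this CPU's TLB
     * entries for a run of 4K user pages via mmu_flush_tlb().  Names not
     * shown in the diff above (tr_cnt, example_flush_user_pages) are
     * assumptions for illustration only.
     */
    static void
    example_flush_user_pages(uintptr_t va, pgcnt_t npages)
    {
            tlb_range_t range = {
                    .tr_va = va,            /* start of the range */
                    .tr_cnt = npages,       /* assumed: page count at tr_level */
                    .tr_level = 0           /* level 0 == 4K pages */
            };

            /*
             * FLUSH_TLB_RANGE invalidates the range for PCID_KERNEL and,
             * when a user HAT is active on this CPU, for PCID_USER as
             * well.  Per the comment on mmu_flush_tlb(), PT_GLOBAL
             * mappings are left intact when INVPCID is available.
             */
            mmu_flush_tlb(FLUSH_TLB_RANGE, &range);
    }

As the comments in the diff note, these are the local-CPU primitives underlying hat_tlb_inval(); cross-CPU shootdowns are still driven from the HAT layer, and callers are expected to be running on PCID_KERNEL (the functions assert this).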
    